FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
cuda_util.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/util/cuda_util.hpp>
9#include <kernel/util/half.hpp>
10#include <kernel/util/string.hpp>
11#include <kernel/util/assertion.hpp>
12#include "cuda_profiler_api.h"
13
14#ifdef FEAT_HAVE_CUDSS
15#include <kernel/solver/cudss.hpp>
16#endif
17
18using namespace FEAT;
19
// Default CUDA kernel block sizes for the various operation classes;
// most of these can be overridden at runtime via Util::cuda_set_blocksize().
Index FEAT::Util::cuda_blocksize_misc = 256;
Index FEAT::Util::cuda_blocksize_reduction = 256;
Index FEAT::Util::cuda_blocksize_spmv = 256;
Index FEAT::Util::cuda_blocksize_axpy = 256;
Index FEAT::Util::cuda_blocksize_scalar_assembly = 256;
Index FEAT::Util::cuda_blocksize_blocked_assembly = 128;
// note: the vanka assembly block size is not covered by cuda_set_blocksize()
Index FEAT::Util::cuda_blocksize_vanka_assembly = 64;

// index of the CUDA device used by this process; chosen in cuda_initialize()
int FEAT::Util::cuda_device_number = 0;

// library handles and cublasLt matmul algorithm cache;
// created in cuda_initialize(), destroyed in cuda_finalize()
cusparseHandle_t FEAT::Util::Intern::cusparse_handle;
cublasHandle_t FEAT::Util::Intern::cublas_handle;
cublasLtMatmulAlgo_t * FEAT::Util::Intern::cublas_lt_algo_matmat;
bool * FEAT::Util::Intern::cublas_lt_algo_matmat_initialized;
// lazily grown static workspace buffer, see cuda_get_static_memory()
size_t FEAT::Util::Intern::cuda_workspace_size;
void * FEAT::Util::Intern::cuda_workspace;
36
namespace FEAT
{
  namespace Util
  {
    namespace Intern
    {
      /// CUDA kernel: sets each of the first 'count' entries of 'ptr' to 'val'.
      /// Expects a 1D launch configuration; surplus threads simply return.
      template <typename DT_>
      __global__ void cuda_set_memory(DT_ * ptr, const DT_ val, const Index count)
      {
        // widen to Index before multiplying: blockDim.x * blockIdx.x is evaluated
        // in 32-bit unsigned arithmetic otherwise and may overflow for huge counts
        Index idx = Index(threadIdx.x) + Index(blockDim.x) * Index(blockIdx.x);
        if (idx >= count)
          return;
        ptr[idx] = val;
      }

      /// CUDA kernel: element-wise copy from 'src' to 'dest' with implicit
      /// conversion from DT2_ to DT1_. Expects a 1D launch configuration.
      template <typename DT1_, typename DT2_>
      __global__ void cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
      {
        // same overflow-safe index computation as in cuda_set_memory above
        Index idx = Index(threadIdx.x) + Index(blockDim.x) * Index(blockIdx.x);
        if (idx >= count)
          return;
        dest[idx] = src[idx];
      }
    }
  }
}
63
/// Selects the given CUDA device for all subsequent CUDA calls of this host thread.
void FEAT::Util::cuda_set_device(const int device)
{
  const cudaError_t status = cudaSetDevice(device);
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");
}
69
/// Throws an InternalError if the CUDA runtime holds a pending error.
/// Synchronizes the device first if the runtime sync guard is enabled,
/// so that errors from asynchronous kernel execution become visible.
void FEAT::Util::cuda_check_last_error()
{
  if (Runtime::SyncGuard::enable_synchronize())
    cudaDeviceSynchronize();

  const cudaError_t last_error = cudaGetLastError();
  if (last_error != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
}
78
/// Returns the device pointer mapped to the given (page-locked) host pointer.
void * FEAT::Util::cuda_get_device_pointer(void * host)
{
  void * device = nullptr;
  const cudaError_t status = cudaHostGetDevicePointer(&device, host, 0);
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaHostGetDevicePointer failed!");
  return device;
}
86
/// Allocates 'bytes' bytes of CUDA managed (unified) memory.
/// Returns nullptr for a zero-sized request without touching the CUDA runtime.
void * FEAT::Util::cuda_malloc_managed(const Index bytes)
{
  if (bytes == 0)
    return nullptr;

  void * memory = nullptr;
  const cudaError_t status = cudaMallocManaged(&memory, bytes);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error\n" + stringify(cudaGetErrorString(status)));
  if (nullptr == memory)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_managed allocation error (null pointer returned)");
  return memory;
}
100
/// Allocates 'bytes' bytes of plain device memory.
/// Returns nullptr for a zero-sized request without touching the CUDA runtime.
void * FEAT::Util::cuda_malloc(const Index bytes)
{
  if (bytes == 0)
    return nullptr;

  void * memory = nullptr;
  const cudaError_t status = cudaMalloc(&memory, bytes);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error\n" + stringify(cudaGetErrorString(status)));
  if (nullptr == memory)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc allocation error (null pointer returned)");
  return memory;
}
114
/// Allocates 'bytes' bytes of page-locked (pinned) host memory via cudaMallocHost.
/// Returns nullptr for a zero-sized request without touching the CUDA runtime.
void * FEAT::Util::cuda_malloc_host(const Index bytes)
{
  void * memory(nullptr);
  if(bytes == 0)
    return memory;

  auto status = cudaMallocHost((void**)&memory, bytes);
  // fixed: error messages previously named cuda_malloc instead of cuda_malloc_host
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_host allocation error\n" + stringify(cudaGetErrorString(status)));
  if (memory == nullptr)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_malloc_host allocation error (null pointer returned)");
  return memory;
}
128
/// Returns a static device workspace of at least 'bytes' bytes, growing it on demand.
/// The buffer is shared by all callers and released in cuda_free_static_memory().
void * FEAT::Util::cuda_get_static_memory(const Index bytes)
{
  if(Intern::cuda_workspace_size < bytes)
  {
    // release the old (too small) workspace and invalidate the bookkeeping
    // first, so that a failed re-allocation below cannot leave a dangling
    // pointer with a stale non-zero size behind
    cudaFree(Intern::cuda_workspace);
    Intern::cuda_workspace = nullptr;
    Intern::cuda_workspace_size = size_t(0u);

    auto status = cudaMalloc(&Intern::cuda_workspace, bytes);
    if (status != cudaSuccess)
      throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error\n" + stringify(cudaGetErrorString(status)));
    if (Intern::cuda_workspace == nullptr)
      throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_get_static_memory allocation error (null pointer returned)");
    Intern::cuda_workspace_size = bytes;
  }
  return Intern::cuda_workspace;
}
143
/// Releases the static device workspace acquired via cuda_get_static_memory().
/// A no-op if no workspace is currently allocated.
void FEAT::Util::cuda_free_static_memory()
{
  if (nullptr == Intern::cuda_workspace)
    return;

  const cudaError_t status = cudaFree(Intern::cuda_workspace);
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_static_memory: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));

  // reset the bookkeeping
  Intern::cuda_workspace = nullptr;
  Intern::cuda_workspace_size = size_t(0u);
}
155
/// Frees device memory previously allocated via cuda_malloc / cuda_malloc_managed.
/// Passing nullptr is a no-op.
void FEAT::Util::cuda_free(void * address)
{
  if (nullptr == address)
    return;

  const cudaError_t status = cudaFree(address);
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free: cudaFree failed!\n" + stringify(cudaGetErrorString(status)));
}
165
/// Frees pinned host memory previously allocated via cuda_malloc_host.
/// Passing nullptr is a no-op.
void FEAT::Util::cuda_free_host(void * address)
{
  if (nullptr == address)
    return;

  const cudaError_t status = cudaFreeHost(address);
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_free_host: cudaFreeHost failed!\n" + stringify(cudaGetErrorString(status)));
}
175
/// Initializes the CUDA backend for this process: selects a device based on the
/// MPI rank, verifies managed-memory support and creates the cublas/cusparse
/// handles plus the cublasLt matmul algorithm cache.
/// Must be paired with a call to cuda_finalize().
void FEAT::Util::cuda_initialize(int rank, int /*ranks_per_node*/, int /*ranks_per_uma*/, int gpus_per_node)
{
  /// \todo enable non cuda ranks and ensure balance of ranks per numa section
  // guard against modulo-by-zero for a bogus node configuration
  XASSERTM(gpus_per_node > 0, "cuda_initialize called with non-positive gpus_per_node!");

  // distribute the ranks round-robin over the devices of the node
  FEAT::Util::cuda_device_number = rank % gpus_per_node;
  if (cudaSuccess != cudaSetDevice(cuda_device_number))
    throw InternalError(__func__, __FILE__, __LINE__, "cudaSetDevice failed!");

  // cuda_malloc_managed requires managed-memory support on the selected device
  int mm_support = 0;
  if (cudaSuccess != cudaDeviceGetAttribute(&mm_support, cudaDevAttrManagedMemory, cuda_device_number))
    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetAttribute failed!");
  XASSERTM(mm_support == 1, "selected cuda device does not support managed memory!");

  // create library handles; scalar parameters are passed by host pointer
  if (CUBLAS_STATUS_SUCCESS != cublasCreate(&Util::Intern::cublas_handle))
    throw InternalError(__func__, __FILE__, __LINE__, "cublasCreate failed!");
  if (CUSPARSE_STATUS_SUCCESS != cusparseCreate(&Util::Intern::cusparse_handle))
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreate failed!");
  if (CUBLAS_STATUS_SUCCESS != cublasSetPointerMode(Util::Intern::cublas_handle, CUBLAS_POINTER_MODE_HOST))
    throw InternalError(__func__, __FILE__, __LINE__, "cublasSetPointerMode failed!");
  if (CUSPARSE_STATUS_SUCCESS != cusparseSetPointerMode(Util::Intern::cusparse_handle, CUSPARSE_POINTER_MODE_HOST))
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSetPointerMode failed!");

  // allow cublas to use TF32 tensor-op math
  if (CUBLAS_STATUS_SUCCESS != cublasSetMathMode(Util::Intern::cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH))
    throw InternalError(__func__, __FILE__, __LINE__, "cublasSetMathMode failed!");

  // set up the cublasLt matmul algorithm cache;
  // note: the slot count 6 is also hard-wired in cuda_reset_algos()
  constexpr int num_algo_slots = 6;
  Util::Intern::cublas_lt_algo_matmat = new cublasLtMatmulAlgo_t[num_algo_slots];
  Util::Intern::cublas_lt_algo_matmat_initialized = new bool[num_algo_slots];
  for (int i(0) ; i < num_algo_slots ; ++i)
  {
    Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
  }

  // the static workspace is allocated lazily by cuda_get_static_memory()
  Util::Intern::cuda_workspace_size = 0;
  Util::Intern::cuda_workspace = nullptr;
}
213
/// Shuts down the CUDA backend: synchronizes the device, destroys the library
/// handles, releases the cublasLt algorithm cache and the static workspace,
/// and surfaces any still-pending CUDA errors. Counterpart of cuda_initialize().
void FEAT::Util::cuda_finalize()
{
  // make sure all pending device work has finished before tearing down
  if (cudaSuccess != cudaDeviceSynchronize())
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed!");

  if (CUBLAS_STATUS_SUCCESS != cublasDestroy(Util::Intern::cublas_handle))
    throw InternalError(__func__, __FILE__, __LINE__, "cublasDestroy failed!");
  if (CUSPARSE_STATUS_SUCCESS != cusparseDestroy(Util::Intern::cusparse_handle))
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroy failed!");

  // release the algo cache and null the pointers, so that a late call to
  // cuda_reset_algos() cannot touch freed memory
  delete[] Util::Intern::cublas_lt_algo_matmat;
  Util::Intern::cublas_lt_algo_matmat = nullptr;
  delete[] Util::Intern::cublas_lt_algo_matmat_initialized;
  Util::Intern::cublas_lt_algo_matmat_initialized = nullptr;

  cuda_free_static_memory();

  // surface any errors still pending in the CUDA runtime
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "Pending cuda errors occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
}
233
/// Synchronizes the device, but only if the runtime sync guard is enabled;
/// otherwise this is a no-op so that asynchronous execution is not serialized.
void FEAT::Util::cuda_synchronize()
{
  if (!Runtime::SyncGuard::enable_synchronize())
    return;

  const cudaError_t status = cudaDeviceSynchronize();
  if (status != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
}
243
/// Unconditionally synchronizes the device, ignoring the runtime sync guard.
void FEAT::Util::cuda_force_synchronize()
{
  const cudaError_t status = cudaDeviceSynchronize();
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSynchronize failed: " + stringify(cudaGetErrorString(status)));
}
250
/// Destroys all allocations and resets the state of the current device.
void FEAT::Util::cuda_reset_device()
{
  const cudaError_t status = cudaDeviceReset();
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceReset failed: " + stringify(cudaGetErrorString(status)));
}
257
/// Copies 'bytes' bytes from 'src' to 'dest'; cudaMemcpyDefault lets the
/// runtime infer the transfer direction from the pointer values.
void FEAT::Util::cuda_copy(void * dest, const void * src, const Index bytes)
{
  const cudaError_t status = cudaMemcpy(dest, src, bytes, cudaMemcpyDefault);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
}
264
/// Copies 'bytes' bytes from host memory 'src' to device memory 'dest'.
void FEAT::Util::cuda_copy_host_to_device(void * dest, const void * src, const Index bytes)
{
  const cudaError_t status = cudaMemcpy(dest, src, bytes, cudaMemcpyHostToDevice);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
}
271
/// Copies 'bytes' bytes from device memory 'src' to host memory 'dest'.
void FEAT::Util::cuda_copy_device_to_host(void * dest, const void * src, const Index bytes)
{
  const cudaError_t status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToHost);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
}
278
/// Copies 'bytes' bytes between two device memory regions.
void FEAT::Util::cuda_copy_device_to_device(void * dest, const void * src, const Index bytes)
{
  const cudaError_t status = cudaMemcpy(dest, src, bytes, cudaMemcpyDeviceToDevice);
  if (cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaMemcpy failed: " + stringify(cudaGetErrorString(status)));
}
285
/// Overrides the default kernel block sizes for the various operation classes.
/// Note: the vanka assembly block size is not configurable via this function.
void FEAT::Util::cuda_set_blocksize(Index misc, Index reduction, Index spmv, Index axpy, Index scalar_assembly, Index blocked_assembly)
{
  FEAT::Util::cuda_blocksize_misc = misc;
  FEAT::Util::cuda_blocksize_reduction = reduction;
  FEAT::Util::cuda_blocksize_spmv = spmv;
  FEAT::Util::cuda_blocksize_axpy = axpy;
  FEAT::Util::cuda_blocksize_scalar_assembly = scalar_assembly;
  FEAT::Util::cuda_blocksize_blocked_assembly = blocked_assembly;
}
300
/// Invalidates all cached cublasLt matmul algorithms, forcing re-selection.
/// Safe to call before cuda_initialize() or after cuda_finalize(): the cache
/// pointer is null then (zero-initialized global) and the call is a no-op.
void FEAT::Util::cuda_reset_algos()
{
  if (Util::Intern::cublas_lt_algo_matmat_initialized == nullptr)
    return;

  // the slot count 6 must match the allocation in cuda_initialize()
  for (int i(0) ; i < 6 ; ++i)
  {
    Util::Intern::cublas_lt_algo_matmat_initialized[i] = false;
  }
}
308
/// Fills the first 'count' entries of device array 'address' with 'val' by
/// launching the Intern::cuda_set_memory kernel with a 1D grid, then
/// synchronizes the device.
template <typename DT_>
void FEAT::Util::cuda_set_memory(DT_ * address, const DT_ val, const Index count)
{
  // nothing to do for empty arrays; also avoids launching a zero-sized grid,
  // which the CUDA runtime rejects as an invalid configuration
  if (count == Index(0))
    return;

  Index blocksize = FEAT::Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  // exact integer ceiling division instead of the double based ceil(), which
  // can misround for counts beyond the 53-bit double mantissa
  grid.x = (unsigned)((count + blocksize - Index(1)) / blocksize);
  FEAT::Util::Intern::cuda_set_memory<<<grid, block>>>(address, val, count);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_set_memory failed!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
#ifdef FEAT_HAVE_HALFMATH
template void FEAT::Util::cuda_set_memory(Half * , const Half, const Index);
#endif
template void FEAT::Util::cuda_set_memory(float * , const float, const Index);
template void FEAT::Util::cuda_set_memory(double * , const double, const Index);
template void FEAT::Util::cuda_set_memory(unsigned int * , const unsigned int, const Index);
template void FEAT::Util::cuda_set_memory(unsigned long * , const unsigned long, const Index);
template void FEAT::Util::cuda_set_memory(unsigned long long * , const unsigned long long, const Index);
template void FEAT::Util::cuda_set_memory(int * , const int, const Index);
template void FEAT::Util::cuda_set_memory(long * , const long, const Index);
template void FEAT::Util::cuda_set_memory(long long * , const long long, const Index);
337
/// Converts 'count' entries from device array 'src' (type DT2_) into device
/// array 'dest' (type DT1_) by launching the Intern::cuda_convert kernel with
/// a 1D grid, then synchronizes the device.
template <typename DT1_, typename DT2_>
void FEAT::Util::cuda_convert(DT1_ * dest, const DT2_ * src, const Index count)
{
  // nothing to do for empty arrays; also avoids launching a zero-sized grid,
  // which the CUDA runtime rejects as an invalid configuration
  if (count == Index(0))
    return;

  Index blocksize = FEAT::Util::cuda_blocksize_misc;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  // exact integer ceiling division instead of the double based ceil(), which
  // can misround for counts beyond the 53-bit double mantissa
  grid.x = (unsigned)((count + blocksize - Index(1)) / blocksize);
  FEAT::Util::Intern::cuda_convert<<<grid, block>>>(dest, src, count);

  cudaDeviceSynchronize();
  // NOTE(review): cuda_set_memory guards its error check with FEAT_DEBUG_MODE,
  // while this function uses FEAT_DEBUG -- confirm which macro is intended
#ifdef FEAT_DEBUG
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "Util::cuda_convert failed!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
#ifdef FEAT_HAVE_HALFMATH
template void FEAT::Util::cuda_convert<Half, float>(Half *, const float *, const Index);
template void FEAT::Util::cuda_convert<float, Half>(float *, const Half *, const Index);
template void FEAT::Util::cuda_convert<Half, double>(Half *, const double *, const Index);
template void FEAT::Util::cuda_convert<double, Half>(double *, const Half *, const Index);
#endif
template void FEAT::Util::cuda_convert<float, double>(float *, const double *, const Index);
template void FEAT::Util::cuda_convert<double, float>(double *, const float *, const Index);
template void FEAT::Util::cuda_convert<unsigned int, unsigned long>(unsigned int *, const unsigned long *, const Index);
template void FEAT::Util::cuda_convert<unsigned int, unsigned long long>(unsigned int *, const unsigned long long *, const Index);
template void FEAT::Util::cuda_convert<unsigned long, unsigned int>(unsigned long *, const unsigned int *, const Index);
template void FEAT::Util::cuda_convert<unsigned long, unsigned long long>(unsigned long *, const unsigned long long *, const Index);
template void FEAT::Util::cuda_convert<unsigned long long, unsigned int>(unsigned long long *, const unsigned int *, const Index);
template void FEAT::Util::cuda_convert<unsigned long long, unsigned long>(unsigned long long *, const unsigned long *, const Index);
template void FEAT::Util::cuda_convert<unsigned int, double>(unsigned int *, const double *, const Index);
template void FEAT::Util::cuda_convert<unsigned long, double>(unsigned long *, const double *, const Index);
template void FEAT::Util::cuda_convert<unsigned int, float>(unsigned int *, const float *, const Index);
template void FEAT::Util::cuda_convert<unsigned long, float>(unsigned long *, const float *, const Index);
373
/// Returns the number of CUDA devices visible to this process.
int FEAT::Util::cuda_get_device_count()
{
  int num_devices = -1;
  if (cudaGetDeviceCount(&num_devices) != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");
  return num_devices;
}
381
/// Returns the id of the CUDA device currently selected for this host thread.
int FEAT::Util::cuda_get_device_id()
{
  int device_id = -1;
  if (cudaGetDevice(&device_id) != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDevice failed!");
  return device_id;
}
389
/// Returns a human-readable listing of all visible CUDA devices:
/// a count line followed by one "Device <i>: <name>" line per device.
String FEAT::Util::cuda_get_visible_devices()
{
  // query the number of visible devices first
  int device_count = -1;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceCount failed!");

  String result("");
  result += "Number of visible cuda devices: " + stringify(device_count) + "\n" ;

  // append one line with the name of each device
  for (int idev = 0; idev < device_count; ++idev)
  {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, idev) != cudaSuccess)
      throw InternalError(__func__, __FILE__, __LINE__, "cudaGetDeviceProperties failed!");
    result += "Device " + stringify(idev) + ": " + stringify(prop.name) + "\n";
  }
  return result;
}
409
/// Returns the current per-thread device stack size limit in bytes
/// (cudaLimitStackSize).
std::size_t Util::cuda_get_max_cache_thread()
{
  std::size_t value = 0;
  auto status = cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize);
  // fixed: error message previously named cudaDeviceSetLimit, but the call
  // above is cudaDeviceGetLimit
  if(cudaSuccess != status)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetLimit failed!");
  return value;
}
418
/// Sets the per-thread device stack size limit (cudaLimitStackSize) to 'bytes'
/// and verifies by reading the limit back; aborts if the device granted a
/// different value than requested.
void FEAT::Util::cuda_set_max_cache_thread(const std::size_t bytes)
{
  if (cudaDeviceSetLimit(cudaLimit::cudaLimitStackSize, bytes) != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceSetLimit failed!");

  // read back the limit actually granted by the device
  std::size_t value = 0;
  if (cudaDeviceGetLimit(&value, cudaLimit::cudaLimitStackSize) != cudaSuccess)
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetLimit failed!");

  if (value != bytes)
    XABORTM("Could not set max cache per thread (" +stringify(value)+") to expected amount(" + stringify(bytes) + ")");
}
431
/// Starts data collection via the CUDA profiler API.
/// The return status of cudaProfilerStart is deliberately ignored here.
void FEAT::Util::cuda_start_profiling()
{
  cudaProfilerStart();
}
436
/// Stops data collection via the CUDA profiler API.
/// The return status of cudaProfilerStop is deliberately ignored here.
void FEAT::Util::cuda_stop_profiling()
{
  cudaProfilerStop();
}
441
/// Returns the maximum shared memory per multiprocessor (in bytes) of the
/// device selected in Util::cuda_device_number.
std::size_t FEAT::Util::cuda_get_shared_mem_per_sm()
{
  int shared_mem = 0;
  const cudaError_t status = cudaDeviceGetAttribute(&shared_mem, cudaDevAttrMaxSharedMemoryPerMultiprocessor, Util::cuda_device_number);
  if (status != cudaSuccess)
  {
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
  }
  return std::size_t(shared_mem);
}
452
/// Returns the maximum number of resident blocks per multiprocessor of the
/// device selected in Util::cuda_device_number.
std::size_t FEAT::Util::cuda_get_max_blocks_per_sm()
{
  int max_blocks = 0;
  const cudaError_t status = cudaDeviceGetAttribute(&max_blocks, cudaDevAttrMaxBlocksPerMultiprocessor, Util::cuda_device_number);
  if (status != cudaSuccess)
  {
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
  }
  return std::size_t(max_blocks);
}
463
/// Returns the number of multiprocessors of the device selected in
/// Util::cuda_device_number.
std::size_t FEAT::Util::cuda_get_sm_count()
{
  int sm_count = 0;
  const cudaError_t status = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, Util::cuda_device_number);
  if (status != cudaSuccess)
  {
    throw InternalError(__func__, __FILE__, __LINE__, "cudaDeviceGetAttribute failed!");
  }
  return std::size_t(sm_count);
}