FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
cuda_util.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
8#ifdef FEAT_HAVE_CUDA
9
10// includes, FEAT
12#include <kernel/util/string.hpp>
14
15#ifdef __CUDACC__
16#include <cusparse_v2.h>
17#include <cublas_v2.h>
18#include <cublasLt.h>
19#endif
20
21namespace FEAT
22{
23 namespace Util
24 {
26 namespace Intern
27 {
28#ifdef __CUDACC__
// Internal CUDA library state. Everything here is declared extern, so the
// definitions live in another translation unit that is not part of this header.
// cuSPARSE and cuBLAS library handles (types come from cusparse_v2.h / cublas_v2.h).
29 extern cusparseHandle_t cusparse_handle;
30 extern cublasHandle_t cublas_handle;
// NOTE(review): presumably an array of cached cublasLt matmul algorithms together with a
// parallel array of "already initialized" flags; length and indexing are not visible here - confirm.
31 extern cublasLtMatmulAlgo_t * cublas_lt_algo_matmat;
32 extern bool * cublas_lt_algo_matmat_initialized;
// NOTE(review): presumably the size (in bytes) and base address of a scratch workspace buffer;
// which memory space the pointer refers to is not visible here - confirm against the definition.
33 extern size_t cuda_workspace_size;
34 extern void * cuda_workspace;
35#endif
36 }
38
// Global CUDA configuration values. All of them are extern declarations; the
// definitions and initial values are not part of this header.
// NOTE(review): presumably the id of the CUDA device in use - confirm against the definition.
40 extern int cuda_device_number;
41
// Block-size settings, one per kernel category as suggested by the variable names.
// NOTE(review): presumably threads per block used at kernel launch - units are not visible here.
// category: miscellaneous
43 extern Index cuda_blocksize_misc;
44
// category: reductions
46 extern Index cuda_blocksize_reduction;
47
// category: sparse matrix-vector products
49 extern Index cuda_blocksize_spmv;
50
// category: axpy
52 extern Index cuda_blocksize_axpy;
53
// category: scalar assembly
55 extern Index cuda_blocksize_scalar_assembly;
56
// category: blocked assembly
58 extern Index cuda_blocksize_blocked_assembly;
59
// category: vanka assembly
61 extern Index cuda_blocksize_vanka_assembly;
62
// Configuration and error-checking entry points. Only declarations are visible
// in this header; behavior notes are inferred from names and must be confirmed
// against the definitions.
// NOTE(review): this setter takes six block sizes, but seven cuda_blocksize_* variables are
// declared in this header - there is no parameter for cuda_blocksize_vanka_assembly.
// Confirm whether that value is intentionally set elsewhere.
63 void cuda_set_blocksize(Index misc, Index reduction, Index spmv, Index axpy, Index scalar_assembly, Index blocked_assembly);
// NOTE(review): presumably selects the active CUDA device by its id - confirm.
64 void cuda_set_device(const int device);
// NOTE(review): presumably queries the CUDA runtime for a pending error; how an error is
// reported (exception, abort, ...) is not visible here.
65 void cuda_check_last_error();
// Memory management helpers. Only declarations are visible in this header;
// behavior notes are inferred from names and signatures and must be confirmed
// against the definitions.
// NOTE(review): presumably maps a host address to the corresponding device address - confirm.
66 void * cuda_get_device_pointer(void * host);
// Allocation routines: each takes a size (parameter name: bytes) and returns an untyped pointer.
// NOTE(review): presumably managed (unified), device and page-locked host memory, in that order;
// behavior for bytes == 0 and on allocation failure is not visible here.
67 void * cuda_malloc_managed(const Index bytes);
68 void * cuda_malloc(const Index bytes);
69 void * cuda_malloc_host(const Index bytes);
// NOTE(review): presumably returns a reusable buffer of at least the requested size that is
// released by cuda_free_static_memory(); ownership and lifetime rules need confirmation.
70 void * cuda_get_static_memory(const Index bytes);
// Release routines; NOTE(review): presumably the counterparts of the allocators above -
// which allocator pairs with which release function must be checked in the definitions.
71 void cuda_free(void * address);
72 void cuda_free_host(void * address);
73 void cuda_free_static_memory();
// Runtime lifecycle and synchronization entry points. Only declarations are
// visible in this header; behavior notes are inferred from names.
// NOTE(review): the four rank/node counts presumably drive the mapping of processes to GPUs -
// the exact mapping is not visible here.
74 void cuda_initialize(int rank, int ranks_per_node, int ranks_per_uma, int gpus_per_node);
// NOTE(review): presumably the counterpart of cuda_initialize - confirm what it releases.
75 void cuda_finalize();
// NOINLINE is a macro that is not defined in this header.
// NOTE(review): the difference between the plain and the "force" variant is not visible here;
// presumably the plain one may be conditional (e.g. disabled by configuration) - confirm.
76 NOINLINE void cuda_synchronize();
77 NOINLINE void cuda_force_synchronize();
// NOTE(review): presumably resets the current device - confirm side effects on existing allocations.
78 void cuda_reset_device();
// Raw memory copy helpers: each takes a destination, a source and a size
// (parameter name: bytes). Only declarations are visible in this header.
// NOTE(review): the un-suffixed variant presumably lets the runtime infer the copy direction,
// while the suffixed variants name the direction explicitly; whether the copies are
// synchronous with respect to the host is not visible here - confirm.
79 void cuda_copy(void * dest, const void * src, const Index bytes);
80 void cuda_copy_host_to_device(void * dest, const void * src, const Index bytes);
81 void cuda_copy_device_to_host(void * dest, const void * src, const Index bytes);
82 void cuda_copy_device_to_device(void * dest, const void * src, const Index bytes);
// NOTE(review): presumably invalidates cached library algorithm selections - confirm which state is reset.
83 void cuda_reset_algos();
// Function templates that are declared here without a definition.
// NOTE(review): presumably explicitly instantiated for the supported data types in the
// CUDA source file; using an uninstantiated type would then fail at link time - confirm.
// NOTE(review): presumably writes val to count consecutive elements starting at address.
84 template <typename DT_>
85 void cuda_set_memory(DT_ * address, const DT_ val, const Index count);
// NOTE(review): presumably converts count elements of type DT2_ read from src into
// type DT1_ written to dest; whether src and dest may overlap is not visible here.
86 template <typename DT1_, typename DT2_>
87 void cuda_convert(DT1_ * dest, const DT2_ * src, const Index count);
// Device queries, cache-limit accessors and profiling switches. Only
// declarations are visible in this header; behavior notes are inferred from
// names and must be confirmed against the definitions.
// NOTE(review): presumably the number of visible devices and the id of the current device.
88 int cuda_get_device_count();
89 int cuda_get_device_id();
// Returns a String; NOTE(review): presumably a human-readable description of the visible devices.
90 String cuda_get_visible_devices();
// Getter/setter pair for a size given in bytes (setter parameter name: bytes).
// NOTE(review): which runtime limit "max cache thread" refers to is not visible here - confirm.
91 std::size_t cuda_get_max_cache_thread();
92 void cuda_set_max_cache_thread(const std::size_t bytes);
// NOTE(review): presumably start/stop markers for an external profiler.
93 void cuda_start_profiling();
94 void cuda_stop_profiling();
// NOTE(review): presumably properties of the current device (shared memory per multiprocessor,
// maximum resident blocks per multiprocessor, number of multiprocessors) - confirm units.
95 std::size_t cuda_get_shared_mem_per_sm();
96 std::size_t cuda_get_max_blocks_per_sm();
97 std::size_t cuda_get_sm_count();
98
99 #ifdef __CUDACC__
103 template<typename T>
104 inline int cuda_get_occupancy(T kernel_func, int blocksize, int shared_memory = 0)
105 {
106 int num_blocks = 0;
107 if(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_func, blocksize, shared_memory) != cudaSuccess)
108 {
109 throw InternalError(__func__, __FILE__, __LINE__, "cudaOccupancyMaxActiveBlockPerMultiprocessor failed!");
110 }
111 return num_blocks;
112 }
113 #endif
114 }
115}
116#endif // FEAT_HAVE_CUDA
FEAT Kernel base header.
FEAT namespace.
Definition: adjactor.hpp:12
std::uint64_t Index
Index data type.