12#include <kernel/util/string.hpp>
16#include <cusparse_v2.h>
29 extern cusparseHandle_t cusparse_handle;
30 extern cublasHandle_t cublas_handle;
31 extern cublasLtMatmulAlgo_t * cublas_lt_algo_matmat;
32 extern bool * cublas_lt_algo_matmat_initialized;
33 extern size_t cuda_workspace_size;
34 extern void * cuda_workspace;
40 extern int cuda_device_number;
43 extern Index cuda_blocksize_misc;
46 extern Index cuda_blocksize_reduction;
49 extern Index cuda_blocksize_spmv;
52 extern Index cuda_blocksize_axpy;
55 extern Index cuda_blocksize_scalar_assembly;
58 extern Index cuda_blocksize_blocked_assembly;
61 extern Index cuda_blocksize_vanka_assembly;
/// Selects a CUDA device.
/// \param[in] device  Device number to select.
/// NOTE(review): implementation is not visible here; presumably forwards to
/// the CUDA runtime's device selection -- confirm.
void cuda_set_device(const int device);

/// Checks the most recent CUDA error state.
/// NOTE(review): how a pending error is reported (exception, abort, log) is
/// not visible from this declaration -- confirm in the implementation.
void cuda_check_last_error();
66 void * cuda_get_device_pointer(
void * host);
67 void * cuda_malloc_managed(
const Index bytes);
68 void * cuda_malloc(
const Index bytes);
69 void * cuda_malloc_host(
const Index bytes);
70 void * cuda_get_static_memory(
const Index bytes);
71 void cuda_free(
void * address);
72 void cuda_free_host(
void * address);
73 void cuda_free_static_memory();
74 void cuda_initialize(
int rank,
int ranks_per_node,
int ranks_per_uma,
int gpus_per_node);
76 NOINLINE
void cuda_synchronize();
77 NOINLINE
void cuda_force_synchronize();
78 void cuda_reset_device();
79 void cuda_copy(
void * dest,
const void * src,
const Index bytes);
80 void cuda_copy_host_to_device(
void * dest,
const void * src,
const Index bytes);
81 void cuda_copy_device_to_host(
void * dest,
const void * src,
const Index bytes);
82 void cuda_copy_device_to_device(
void * dest,
const void * src,
const Index bytes);
/// Resets the stored algorithm selection.
/// NOTE(review): presumably clears the cuBLASLt state behind
/// cublas_lt_algo_matmat / cublas_lt_algo_matmat_initialized -- confirm.
void cuda_reset_algos();
84 template <
typename DT_>
85 void cuda_set_memory(DT_ * address,
const DT_ val,
const Index count);
86 template <
typename DT1_,
typename DT2_>
87 void cuda_convert(DT1_ * dest,
const DT2_ * src,
const Index count);
88 int cuda_get_device_count();
89 int cuda_get_device_id();
90 String cuda_get_visible_devices();
91 std::size_t cuda_get_max_cache_thread();
92 void cuda_set_max_cache_thread(
const std::size_t bytes);
93 void cuda_start_profiling();
94 void cuda_stop_profiling();
95 std::size_t cuda_get_shared_mem_per_sm();
96 std::size_t cuda_get_max_blocks_per_sm();
97 std::size_t cuda_get_sm_count();
// Asks the CUDA runtime how many blocks of a kernel can be active per
// multiprocessor for a given block size and shared-memory amount.
//
// NOTE(review): this text is incomplete. The template header that declares T,
// the braces, the declaration of num_blocks and any return statement are not
// visible, and the leading numbers (104, 107, 109) look like line-number
// residue from a documentation listing. Confirm against the original header.
//
// kernel_func    kernel whose occupancy is queried
// blocksize      block size handed to the runtime query
// shared_memory  shared-memory amount handed to the runtime query (default 0)
104 inline int cuda_get_occupancy(T kernel_func,
int blocksize,
int shared_memory = 0)
// The runtime writes its result through &num_blocks; any status other than
// cudaSuccess is treated as a failure.
107 if(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_func, blocksize, shared_memory) != cudaSuccess)
// Failure is reported as an InternalError carrying function, file and line.
// NOTE(review): the message spells the API name without the plural "Blocks".
109 throw InternalError(__func__, __FILE__, __LINE__,
"cudaOccupancyMaxActiveBlockPerMultiprocessor failed!");
// Index: the index data type (listed here as std::uint64_t).