12#include <kernel/util/string.hpp> 
   16#include <cusparse_v2.h> 
   // Library handles and scratch state shared by the CUDA backend. Everything
   // in this group is only declared here (`extern`); the definitions are not
   // part of this excerpt.
   // cuSPARSE library handle.
   29      extern cusparseHandle_t cusparse_handle;
 
   // cuBLAS library handle.
   30      extern cublasHandle_t cublas_handle;
 
   // Pointer to cuBLASLt matmul algorithm descriptor(s).
   // NOTE(review): presumably an array of cached algorithms for matrix-matrix
   // products - element count and indexing are not visible here; confirm.
   31      extern cublasLtMatmulAlgo_t * cublas_lt_algo_matmat;
 
   // Pointer to boolean flag(s).
   // NOTE(review): presumably parallel to cublas_lt_algo_matmat and marking
   // which entries have been set up - confirm against the definition.
   32      extern bool * cublas_lt_algo_matmat_initialized;
 
   // Size of the workspace below. NOTE(review): presumably in bytes - confirm.
   33      extern size_t cuda_workspace_size;
 
   // Untyped workspace pointer. NOTE(review): memory space (device/host) and
   // owner are not visible in this excerpt - confirm before using.
   34      extern void * cuda_workspace;
 
   // Device selection and block-size settings. All are `extern` declarations;
   // their initial values are set wherever they are defined (not shown here).
   // NOTE(review): the cuda_blocksize_* values are presumably the number of
   // threads per block used when launching the respective kernel family -
   // the names suggest this, but no launch code is visible here; confirm.
   // Device number. NOTE(review): presumably the device used by this process.
   40    extern int cuda_device_number;
 
   // Block size for miscellaneous kernels (per the name).
   43    extern Index cuda_blocksize_misc;
 
   // Block size for reduction kernels (per the name).
   46    extern Index cuda_blocksize_reduction;
 
   // Block size for sparse matrix-vector product kernels (per the name).
   49    extern Index cuda_blocksize_spmv;
 
   // Block size for axpy kernels (per the name).
   52    extern Index cuda_blocksize_axpy;
 
   // Block size for scalar assembly kernels (per the name).
   55    extern Index cuda_blocksize_scalar_assembly;
 
   // Block size for blocked assembly kernels (per the name).
   58    extern Index cuda_blocksize_blocked_assembly;
 
   // Block size for Vanka assembly kernels (per the name).
   61    extern Index cuda_blocksize_vanka_assembly;
 
   // Prototype only; the definition is not part of this excerpt.
   // Takes the device number to switch to.
   // NOTE(review): presumably forwards to cudaSetDevice - confirm.
   64    void cuda_set_device(
const int device);
 
   // Prototype only. NOTE(review): presumably queries the CUDA runtime's last
   // error and reports it; how a failure is signalled (exception, abort, ...)
   // is not visible here - confirm against the definition.
   65    void cuda_check_last_error();
 
   // Allocation and deallocation prototypes. Only the signatures are visible
   // here; which CUDA runtime calls back them, and how allocation failure is
   // reported, must be checked against the definitions.
   // Takes a host pointer and returns a pointer.
   // NOTE(review): presumably the device-side alias of mapped host memory.
   66    void * cuda_get_device_pointer(
void * host);
 
   // Returns an untyped pointer for a request of `bytes` bytes.
   // NOTE(review): presumably managed (unified) memory - confirm.
   67    void * cuda_malloc_managed(
const Index bytes);
 
   // Returns an untyped pointer for a request of `bytes` bytes.
   // NOTE(review): presumably device memory - confirm.
   68    void * cuda_malloc(
const Index bytes);
 
   // Returns an untyped pointer for a request of `bytes` bytes.
   // NOTE(review): presumably pinned host memory - confirm.
   69    void * cuda_malloc_host(
const Index bytes);
 
   // Returns an untyped pointer for a request of `bytes` bytes.
   // NOTE(review): presumably a reusable buffer paired with
   // cuda_free_static_memory(); lifetime and growth policy are not visible.
   70    void * cuda_get_static_memory(
const Index bytes);
 
   // Takes the address to release.
   // NOTE(review): presumably the counterpart of cuda_malloc /
   // cuda_malloc_managed - confirm which allocators it may be paired with.
   71    void cuda_free(
void * address);
 
   // Takes the address to release.
   // NOTE(review): presumably the counterpart of cuda_malloc_host.
   72    void cuda_free_host(
void * address);
 
   // Takes no arguments.
   // NOTE(review): presumably releases the buffer behind
   // cuda_get_static_memory().
   73    void cuda_free_static_memory();
 
   // Prototype only; the definition is not part of this excerpt.
   // Parameters (meanings taken from their names - confirm):
   //   rank            - rank of the calling process
   //   ranks_per_node  - number of ranks on one node
   //   ranks_per_uma   - number of ranks per UMA domain
   //   gpus_per_node   - number of GPUs on one node
   // NOTE(review): presumably uses these to pick a device and create the
   // library handles declared in this header - confirm.
   74    void cuda_initialize(
int rank, 
int ranks_per_node, 
int ranks_per_uma, 
int gpus_per_node);
 
   // Declared with the project's NOINLINE macro (its definition is not shown).
   // NOTE(review): presumably a synchronization that may be skipped depending
   // on configuration, in contrast to cuda_force_synchronize() - confirm.
   76    NOINLINE 
void cuda_synchronize();
 
   // Declared with the project's NOINLINE macro (its definition is not shown).
   // NOTE(review): presumably always synchronizes with the device - confirm.
   77    NOINLINE 
void cuda_force_synchronize();
 
   // Prototype only. NOTE(review): presumably resets the current device;
   // the effect on the handles declared in this header is not visible here.
   78    void cuda_reset_device();
 
   // Raw copy prototypes. All four take a destination pointer, a read-only
   // source pointer and a size `bytes`; only the signatures are visible here.
   // NOTE(review): whether these calls block until the copy has finished,
   // and which stream they use, must be checked against the definitions.
   // NOTE(review): direction is presumably inferred from the pointers.
   79    void cuda_copy(
void * dest, 
const void * src, 
const Index bytes);
 
   // NOTE(review): per the name, `src` is host memory and `dest` is device
   // memory - confirm.
   80    void cuda_copy_host_to_device(
void * dest, 
const void * src, 
const Index bytes);
 
   // NOTE(review): per the name, `src` is device memory and `dest` is host
   // memory - confirm.
   81    void cuda_copy_device_to_host(
void * dest, 
const void * src, 
const Index bytes);
 
   // NOTE(review): per the name, both pointers refer to device memory.
   82    void cuda_copy_device_to_device(
void * dest, 
const void * src, 
const Index bytes);
 
   // Prototype only. NOTE(review): presumably invalidates the cached cuBLASLt
   // algorithm state (cublas_lt_algo_matmat*) - confirm.
   83    void cuda_reset_algos();
 
   // Function template prototype; no definition or instantiation is visible
   // in this excerpt.
   // Parameters: `address` - start of the target array, `val` - value to
   // store, `count` - number of DT_ elements (not bytes, per the typed pointer
   // and name - confirm).
   // NOTE(review): presumably fills device memory with `val` - confirm.
   84    template <
typename DT_>
 
   85    void cuda_set_memory(DT_ * address, 
const DT_ val, 
const Index count);
 
   // Function template prototype; no definition or instantiation is visible
   // in this excerpt.
   // Parameters: `dest` - target array of DT1_, `src` - read-only source array
   // of DT2_, `count` - number of elements.
   // NOTE(review): presumably converts element-wise from DT2_ to DT1_ on the
   // device - confirm.
   86    template <
typename DT1_, 
typename DT2_>
 
   87    void cuda_convert(DT1_ * dest, 
const DT2_ * src, 
const Index count);
 
   // Device query and profiling prototypes. Only the signatures are visible;
   // the descriptions below follow the names and must be confirmed against
   // the definitions.
   // Returns an int. NOTE(review): presumably the number of CUDA devices.
   88    int cuda_get_device_count();
 
   // Returns an int. NOTE(review): presumably the id of the current device.
   89    int cuda_get_device_id();
 
   // Returns a String (project string type).
   // NOTE(review): presumably a description of the visible devices.
   90    String cuda_get_visible_devices();
 
   // Returns a size. NOTE(review): unit and exact meaning of "max cache
   // thread" are not visible here; the setter's parameter is named `bytes`.
   91    std::size_t cuda_get_max_cache_thread();
 
   // Takes the new value as `bytes`; counterpart of the getter above.
   92    void cuda_set_max_cache_thread(
const std::size_t bytes);
 
   // NOTE(review): presumably starts profiler data collection.
   93    void cuda_start_profiling();
 
   // NOTE(review): presumably stops profiler data collection.
   94    void cuda_stop_profiling();
 
   // Returns a size. NOTE(review): presumably shared memory per
   // multiprocessor, in bytes.
   95    std::size_t cuda_get_shared_mem_per_sm();
 
   // Returns a size. NOTE(review): presumably the maximum number of resident
   // blocks per multiprocessor.
   96    std::size_t cuda_get_max_blocks_per_sm();
 
   // Returns a size. NOTE(review): presumably the number of multiprocessors
   // of the current device.
   97    std::size_t cuda_get_sm_count();
 
  // Fragment of an inline helper: the template header introducing `T`, the
  // braces, the declaration of `num_blocks` and the return statement are not
  // part of this excerpt.
  // Parameters: `kernel_func` - the kernel to query, `blocksize` - block size
  // passed to the query, `shared_memory` - passed as the shared-memory
  // argument of the query (defaults to 0).
  // The visible part calls cudaOccupancyMaxActiveBlocksPerMultiprocessor with
  // &num_blocks and the three parameters, and throws InternalError when the
  // call does not return cudaSuccess.
  104    inline int cuda_get_occupancy(T kernel_func, 
int blocksize, 
int shared_memory = 0)
 
  107      if(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, kernel_func, blocksize, shared_memory) != cudaSuccess)
 
  // NOTE(review): the message below spells the API name without the 's' in
  // "Blocks"; left unchanged here because it is a runtime string.
  109        throw InternalError(__func__, __FILE__, __LINE__, 
"cudaOccupancyMaxActiveBlockPerMultiprocessor failed!");
 
std::uint64_t Index
Index data type.