8#include <kernel/backend.hpp>
9#include <kernel/util/memory_pool.hpp>
10#include <kernel/util/cuda_util.hpp>
11#include <kernel/solver/direct_sparse_solver.hpp>
// Compile-time guard: the code in this file requires the index type CUDSS_IT to be exactly 4 bytes wide.
25 static_assert(
sizeof(CUDSS_IT) == 4,
"DirectSparseSolver: cuDSS: index type size mismatch!");
// cuDSS matrix object for the solution vector; its value array is bound to sol_val_dev when solving
42 cudssMatrix_t vec_sol;
// cuDSS matrix object for the right-hand-side vector; its value array is bound to rhs_val_dev when solving
43 cudssMatrix_t vec_rhs;
// buffer sized for the 'CUDSS_DATA_MEMORY_ESTIMATES' query; entries [1] and [3] are returned by
// get_peak_mem_device() and get_peak_mem_host(), respectively
46 std::int64_t memory_estimates[16];
// problem dimensions: global/owned DOF counts, offset of the first owned DOF, non-zero counts
// and the number of right-hand sides
49 std::int64_t num_global_dofs, dof_offset, num_owned_dofs, num_owned_nzes, num_global_nzes, num_rhs;
// host-side staging arrays for the CSR structure (row pointers, column indices) ...
52 std::vector<CUDSS_IT> row_ptr_host, col_idx_host;
// ... and for the matrix values, the right-hand side and the solution
53 std::vector<CUDSS_DT> mat_val_host, rhs_val_host, sol_val_host;
// device-side counterparts of the CSR structure arrays, allocated via Util::cuda_malloc
56 void *row_ptr_dev, *col_idx_dev;
// device-side counterparts of the value arrays, allocated via Util::cuda_malloc
57 void *mat_val_dev, *rhs_val_dev, *sol_val_dev;
/**
 * Sets up the cuDSS core object: stores the MPI communicator and the cuDSS handle, records the
 * problem dimensions, creates the cuDSS config and data objects, registers the communicator with
 * cuDSS and allocates the host and device arrays.
 *
 * \param[in] comm              communicator whose mpi_comm() is handed to cuDSS
 * \param[in] num_global_dofs_  global number of DOFs
 * \param[in] dof_offset_       offset of the first DOF owned by this process
 * \param[in] num_owned_dofs_   number of DOFs owned by this process
 * \param[in] num_owned_nzes_   number of non-zero entries owned by this process
 * \param[in] num_global_nzes_  global number of non-zero entries
 *
 * Throws InternalError if creating the cuDSS config/data objects or registering the communicator fails.
 */
59 explicit CUDSS_Core(
const Dist::Comm& comm, Index num_global_dofs_, Index dof_offset_,
60 Index num_owned_dofs_, Index num_owned_nzes_, Index num_global_nzes_) :
62 mpi_comm(comm.mpi_comm()),
// the runtime hands out the cuDSS handle as an opaque pointer, hence the cast
64 handle(reinterpret_cast<cudssHandle_t>(Runtime::get_cudss_handle())),
// NOTE(review): all sizes are narrowed through std::uint32_t before being stored in the 64-bit
// members; values that do not fit into 32 bits would wrap silently -- confirm that callers
// guarantee the range
70 num_global_dofs(std::uint32_t(num_global_dofs_)),
71 dof_offset(std::uint32_t(dof_offset_)),
72 num_owned_dofs(std::uint32_t(num_owned_dofs_)),
73 num_owned_nzes(std::uint32_t(num_owned_nzes_)),
74 num_global_nzes(std::uint32_t(num_global_nzes_)),
// without a valid handle none of the cuDSS calls below can work
90 XASSERTM(handle !=
nullptr,
"Failed to retrieve cuDSS handle!");
// create the solver configuration object
92 if(CUDSS_STATUS_SUCCESS != cudssConfigCreate(&config))
93 throw InternalError(__func__, __FILE__, __LINE__,
"cudssConfigCreate failed!");
// create the solver data object
// NOTE(review): if this or a later step throws, the objects created so far are not released here
95 if(CUDSS_STATUS_SUCCESS != cudssDataCreate(handle, &data))
96 throw InternalError(__func__, __FILE__, __LINE__,
"cudssDataCreate failed!");
// register the MPI communicator with cuDSS; the address of the stored communicator is passed
// together with the size of a communicator pointer
99 if(CUDSS_STATUS_SUCCESS != cudssDataSet(handle, data, CUDSS_DATA_COMM, &mpi_comm,
sizeof(MPI_Comm*)))
100 throw InternalError(__func__, __FILE__, __LINE__,
"cudssDataSet for 'CUDSS_DATA_COMM' failed!");
// zero the memory estimates so the getters return 0 until the estimates have been queried
104 memset(memory_estimates, 0,
sizeof(memory_estimates));
// allocate the zero-initialized host arrays: row pointers have one entry more than owned rows
107 row_ptr_host.resize(std::size_t(num_owned_dofs+1), 0u);
108 col_idx_host.resize(std::size_t(num_owned_nzes), 0u);
109 mat_val_host.resize(std::size_t(num_owned_nzes), 0.0);
110 rhs_val_host.resize(std::size_t(num_owned_dofs * num_rhs), 0.0);
111 sol_val_host.resize(std::size_t(num_owned_dofs * num_rhs), 0.0);
// allocate the device arrays with the same element counts as their host counterparts
114 row_ptr_dev = Util::cuda_malloc(
sizeof(CUDSS_IT) * std::size_t(num_owned_dofs+1));
115 col_idx_dev = Util::cuda_malloc(
sizeof(CUDSS_IT) * std::size_t(num_owned_nzes));
116 mat_val_dev = Util::cuda_malloc(
sizeof(CUDSS_DT) * std::size_t(num_owned_nzes));
117 rhs_val_dev = Util::cuda_malloc(
sizeof(CUDSS_DT) * std::size_t(num_owned_dofs * num_rhs));
118 sol_val_dev = Util::cuda_malloc(
sizeof(CUDSS_DT) * std::size_t(num_owned_dofs * num_rhs));
// Teardown sequence: the cuDSS matrix objects are destroyed first, then the device arrays they
// referred to are freed, and finally the cuDSS data and config objects are destroyed.
// The return statuses of the destroy calls are not checked.
124 cudssMatrixDestroy(vec_sol);
126 cudssMatrixDestroy(vec_rhs);
128 cudssMatrixDestroy(matrix);
// free the device arrays obtained from Util::cuda_malloc
131 Util::cuda_free(sol_val_dev);
133 Util::cuda_free(rhs_val_dev);
135 Util::cuda_free(mat_val_dev);
137 Util::cuda_free(col_idx_dev);
139 Util::cuda_free(row_ptr_dev);
// destroy the solver data object (needs the handle) and the configuration object
141 cudssDataDestroy(handle, data);
143 cudssConfigDestroy(config);
// synchronize via Util::cuda_synchronize() once everything has been released
146 Util::cuda_synchronize();
// Symbolic initialization: uploads the CSR structure, creates the cuDSS objects for the system
// matrix, the solution vector and the right-hand-side vector, assigns the owned row range to each
// of them, runs the analysis phase and queries the memory estimates.
// Throws DirectSparseSolverException if any checked cuDSS call reports a failure.

// status of the most recent cuDSS call; initialized to an error status
151 cudssStatus_t ret = CUDSS_STATUS_INTERNAL_ERROR;
// global index of the last row owned by this process (inclusive end of the owned row range)
154 const std::int64_t last_owned_dof = dof_offset + num_owned_dofs - 1;
// upload the CSR structure arrays (row pointers and column indices) to the device
157 Util::cuda_copy_host_to_device(row_ptr_dev, row_ptr_host.data(),
sizeof(CUDSS_IT) * std::size_t(num_owned_dofs+1));
158 Util::cuda_copy_host_to_device(col_idx_dev, col_idx_host.data(),
sizeof(CUDSS_IT) * std::size_t(num_owned_nzes));
// create the cuDSS CSR matrix object for the system matrix
167 ret = cudssMatrixCreateCsr(
181 if(ret != CUDSS_STATUS_SUCCESS)
182 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixCreateCsr() for system matrix failed!");
// assign the owned row range to the system matrix; the message names the system matrix so that
// this failure can be told apart from the identical call for the solution vector below
185 ret = cudssMatrixSetDistributionRow1d(matrix, dof_offset, last_owned_dof);
186 if(ret != CUDSS_STATUS_SUCCESS)
187 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixSetDistributionRow1d() for system matrix failed!");
// create the dense cuDSS matrix object for the solution vector (column-major layout)
190 ret = cudssMatrixCreateDn(
197 CUDSS_LAYOUT_COL_MAJOR);
198 if(ret != CUDSS_STATUS_SUCCESS)
199 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixCreateDn() for solution vector failed!");
// assign the owned row range to the solution vector
202 ret = cudssMatrixSetDistributionRow1d(vec_sol, dof_offset, last_owned_dof);
203 if(ret != CUDSS_STATUS_SUCCESS)
204 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixSetDistributionRow1d() for solution vector failed!");
// create the dense cuDSS matrix object for the right-hand-side vector (column-major layout)
207 ret = cudssMatrixCreateDn(
214 CUDSS_LAYOUT_COL_MAJOR);
215 if(ret != CUDSS_STATUS_SUCCESS)
216 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixCreateDn() for rhs vector failed!");
// assign the owned row range to the right-hand-side vector
219 ret = cudssMatrixSetDistributionRow1d(vec_rhs, dof_offset, last_owned_dof);
220 if(ret != CUDSS_STATUS_SUCCESS)
221 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixSetDistributionRow1d() for rhs vector failed!");
// run the analysis phase
226 CUDSS_PHASE_ANALYSIS,
232 if(ret != CUDSS_STATUS_SUCCESS)
233 throw DirectSparseSolverException(
"cuDSS",
"cudssExecute() for phase 'CUDSS_PHASE_ANALYSIS' failed!");
// query the memory estimates into memory_estimates
236 std::size_t bytes_written(0u);
240 CUDSS_DATA_MEMORY_ESTIMATES,
242 sizeof(memory_estimates),
244 if(ret != CUDSS_STATUS_SUCCESS)
245 throw DirectSparseSolverException(
"cuDSS",
"cudssDataGet() for 'CUDSS_DATA_MEMORY_ESTIMATES' failed!");
// synchronize via Util::cuda_synchronize() at the end of the symbolic setup
248 Util::cuda_synchronize();
// Numeric initialization: uploads the matrix values and runs the factorization phase.
// Throws DirectSparseSolverException if the factorization reports a failure.

// copy the matrix values from the host staging array to the device
254 Util::cuda_copy_host_to_device(mat_val_dev, mat_val_host.data(),
sizeof(CUDSS_DT) * std::size_t(num_owned_nzes));
257 cudssStatus_t ret = cudssExecute(
259 CUDSS_PHASE_FACTORIZATION,
265 if(ret != CUDSS_STATUS_SUCCESS)
266 throw DirectSparseSolverException(
"cuDSS",
"cudssExecute() for phase 'CUDSS_PHASE_FACTORIZATION' failed!");
// synchronize via Util::cuda_synchronize() after the factorization
269 Util::cuda_synchronize();
// Solve phase: uploads the right-hand side, binds the device value arrays to the cuDSS vector
// objects, runs the solve phase and downloads the solution into sol_val_host.
// Throws DirectSparseSolverException if any checked cuDSS call reports a failure.

// status of the most recent cuDSS call; initialized to an error status
274 cudssStatus_t ret = CUDSS_STATUS_INTERNAL_ERROR;
// copy the right-hand-side values from the host staging array to the device
277 Util::cuda_copy_host_to_device(rhs_val_dev, rhs_val_host.data(),
sizeof(CUDSS_DT) * std::size_t(num_owned_dofs * num_rhs));
// bind the device solution array to the solution vector object
280 ret = cudssMatrixSetValues(vec_sol, sol_val_dev);
281 if(ret != CUDSS_STATUS_SUCCESS)
282 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixSetValues() failed for vec_sol!");
// bind the device right-hand-side array to the rhs vector object; the status has to be stored
// in 'ret', otherwise the check below would only re-test the result of the vec_sol call above
// and a failure of this call would go unnoticed
285 ret = cudssMatrixSetValues(vec_rhs, rhs_val_dev);
286 if(ret != CUDSS_STATUS_SUCCESS)
287 throw DirectSparseSolverException(
"cuDSS",
"cudssMatrixSetValues() failed for vec_rhs!");
// check the outcome of the solve phase
298 if(ret != CUDSS_STATUS_SUCCESS)
299 throw DirectSparseSolverException(
"cuDSS",
"cudssExecute() for phase 'CUDSS_PHASE_SOLVE' failed!");
// copy the computed solution from the device back into the host staging array
302 Util::cuda_copy_device_to_host(sol_val_host.data(), sol_val_dev,
sizeof(CUDSS_DT) * std::size_t(num_owned_dofs * num_rhs));
// synchronize via Util::cuda_synchronize() after the download
305 Util::cuda_synchronize();
// Returns entry [1] of memory_estimates as the peak device memory.
// NOTE(review): the meaning of index 1 is defined by cuDSS' 'CUDSS_DATA_MEMORY_ESTIMATES' layout --
// verify against the cuDSS documentation. The value is 0 until the estimates have been queried.
308 std::int64_t get_peak_mem_device()
const
310 return memory_estimates[1];
// Returns entry [3] of memory_estimates as the peak host memory.
// NOTE(review): the meaning of index 3 is defined by cuDSS' 'CUDSS_DATA_MEMORY_ESTIMATES' layout --
// verify against the cuDSS documentation. The value is 0 until the estimates have been queried.
313 std::int64_t get_peak_mem_host()
const
315 return memory_estimates[3];
// Allocates a new CUDSS_Core with the given communicator and dimensions and returns it as an
// opaque pointer. The object is created with 'new'; destroy_cudss_core() deletes it.
// 'comm' is dereferenced unconditionally and therefore must not be null.
319 void* create_cudss_core(
const Dist::Comm* comm, Index num_global_dofs, Index dof_offset,
320 Index num_owned_dofs, Index num_owned_nzes, Index num_global_nzes)
322 return new CUDSS_Core(*comm, num_global_dofs, dof_offset, num_owned_dofs, num_owned_nzes, num_global_nzes);
// Deletes the CUDSS_Core object behind the opaque pointer 'core'.
// 'core' is cast to CUDSS_Core* without any check, so it must originate from create_cudss_core().
325 void destroy_cudss_core(
void* core)
328 delete reinterpret_cast<CUDSS_Core*
>(core);
// Returns a pointer to the host-side row-pointer array of the core object behind 'core'.
// The pointer refers to the storage of the core's std::vector and is owned by the core.
331 CUDSS_IT* get_cudss_row_ptr(
void* core)
334 return reinterpret_cast<CUDSS_Core*
>(core)->row_ptr_host.data();
// Returns a pointer to the host-side column-index array of the core object behind 'core'.
// The pointer refers to the storage of the core's std::vector and is owned by the core.
337 CUDSS_IT* get_cudss_col_idx(
void* core)
340 return reinterpret_cast<CUDSS_Core*
>(core)->col_idx_host.data();
// Returns a pointer to the host-side matrix value array of the core object behind 'core'.
// The pointer refers to the storage of the core's std::vector and is owned by the core.
343 CUDSS_DT* get_cudss_mat_val(
void* core)
346 return reinterpret_cast<CUDSS_Core*
>(core)->mat_val_host.data();
// Returns a pointer to the host-side right-hand-side value array of the core object behind 'core'.
// The pointer refers to the storage of the core's std::vector and is owned by the core.
349 CUDSS_DT* get_cudss_rhs_val(
void* core)
352 return reinterpret_cast<CUDSS_Core*
>(core)->rhs_val_host.data();
// Returns a pointer to the host-side solution value array of the core object behind 'core'.
// The pointer refers to the storage of the core's std::vector and is owned by the core.
355 CUDSS_DT* get_cudss_sol_val(
void* core)
358 return reinterpret_cast<CUDSS_Core*
>(core)->sol_val_host.data();
// Forwards to CUDSS_Core::init_symbolic() of the core object behind the opaque pointer 'core'.
361 void init_cudss_symbolic(
void* core)
364 reinterpret_cast<CUDSS_Core*
>(core)->init_symbolic();
// Forwards to CUDSS_Core::init_numeric() of the core object behind the opaque pointer 'core'.
367 void init_cudss_numeric(
void* core)
370 reinterpret_cast<CUDSS_Core*
>(core)->init_numeric();
// Forwards to CUDSS_Core::solve() of the core object behind the opaque pointer 'core'.
373 void solve_cudss(
void* core)
376 reinterpret_cast<CUDSS_Core*
>(core)->
solve();
// Returns the value of CUDSS_Core::get_peak_mem_host() for the core object behind 'core'.
379 std::int64_t get_peak_mem_cudss_host(
void* core)
382 return reinterpret_cast<CUDSS_Core*
>(core)->get_peak_mem_host();
// Returns the value of CUDSS_Core::get_peak_mem_device() for the core object behind 'core'.
385 std::int64_t get_peak_mem_cudss_device(
void* core)
388 return reinterpret_cast<CUDSS_Core*
>(core)->get_peak_mem_device();
396void feat_direct_sparse_solver_cudss_dummy()
#define XASSERT(expr)
Assertion macro definition.
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Status solve(SolverBase< Vector_ > &solver, Vector_ &vec_sol, const Vector_ &vec_rhs, const Matrix_ &matrix, const Filter_ &filter)
Solve linear system with initial solution guess.