FEAT 3
Finite Element Analysis Toolbox
apply.cu
// FEAT3: Finite Element Analysis Toolbox, Version 3
// Copyright (C) 2010 by Stefan Turek & the FEAT group
// FEAT3 is released under the GNU General Public License version 3,
// see the file 'copyright.txt' in the top level directory for details.

// includes, FEAT
#include <kernel/base_header.hpp>
#include <kernel/lafem/arch/apply.hpp>
#include <kernel/lafem/arch/component_product.hpp>
#include <kernel/lafem/arch/scale.hpp>
#include <kernel/util/exception.hpp>
#include <kernel/util/memory_pool.hpp>
#include <kernel/util/math.hpp>
#include <kernel/util/half.hpp>

16#include "cusparse_v2.h"
17
namespace FEAT
{
  namespace LAFEM
  {
    namespace Intern
    {
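      // CUDA kernel for the banded matrix-vector product r <- alpha * A * x + beta * r:
      // one thread per matrix row; 'start' and 'end' select the range of band offsets that
      // actually intersect row 'idx', so only valid diagonal entries are accumulated.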
      template <typename DT_, typename IT_>
      __global__ void cuda_apply_banded(DT_ * r, const DT_ alpha, const DT_ * x, const DT_ beta, const DT_ * val, const IT_ * offsets, const Index num_of_offsets, const Index rows, const Index columns)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= rows)
          return;

        const Index k1(rows - 1);
        const Index k2(rows + columns - 1);

        Index start(0);

        while (k1 > offsets[start] + idx)
        {
          ++start;
        }

        Index end(start);

        while (end < num_of_offsets && idx + offsets[end] < k2)
        {
          ++end;
        }

        DT_ sum(DT_(0.0));
        for (Index diag(start); diag < end; ++diag)
        {
          sum += val[rows * diag + idx] * x[idx + offsets[diag] - rows + 1];
        }
        r[idx] = (sum * alpha) + beta * r[idx];
      }

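      // thin overload wrappers that dispatch to the precision-specific cuSPARSE bsrmv routine
      // (block-compressed sparse row matrix-vector product y <- alpha * A * x + beta * y)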
      void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
                               int m, int n, int nnz,
                               const float * alpha, const cusparseMatDescr_t descrA,
                               const float * csrVal, const int * csrRowPtr, const int * csrColInd,
                               int block_dim,
                               const float * x, const float * beta, float * y)
      {
        cusparseStatus_t status;
        status = cusparseSbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
                                csrColInd, block_dim, x, beta, y);
        if (status != CUSPARSE_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cusparseSbsrmv failed with status code: " + stringify(status));
      }

      void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
                               int m, int n, int nnz,
                               const double * alpha, const cusparseMatDescr_t descrA,
                               const double * csrVal, const int * csrRowPtr, const int * csrColInd,
                               int block_dim,
                               const double * x, const double * beta, double * y)
      {
        cusparseStatus_t status;
        status = cusparseDbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
                                csrColInd, block_dim, x, beta, y);
        if (status != CUSPARSE_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cusparseDbsrmv failed with status code: " + stringify(status));
      }

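      // thin overload wrappers around cuBLAS GEMV; 'val' holds a row-major (m x n) matrix,
      // which cuBLAS interprets as its column-major (n x m) transpose, so the dimensions are
      // swapped here and the caller passes CUBLAS_OP_T to obtain y <- alpha * A * x + beta * y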
      void cublas_apply_dense(cublasOperation_t trans,
                              int m, int n,
                              const float * alpha,
                              const float * val,
                              const float * x, const float * beta, float * y)
      {
        cublasStatus_t status;
        status = cublasSgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
        if (status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
      }

      void cublas_apply_dense(cublasOperation_t trans,
                              int m, int n,
                              const double * alpha,
                              const double * val,
                              const double * x, const double * beta, double * y)
      {
        cublasStatus_t status;
        status = cublasDgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
        if (status != CUBLAS_STATUS_SUCCESS)
          throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
      }

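      // CUDA kernel for a CSR matrix with scalar entries applied to a blocked vector:
      // each thread handles one matrix row and accumulates a BlockSize_-sized sub-vector,
      // i.e. every scalar entry val[i] scales a whole block of x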
      template <int BlockSize_, typename DT_, typename IT_>
      __global__ void cuda_apply_csrsb(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
                                       const IT_ * row_ptr, const Index count)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= count)
          return;

        DT_ bsum[BlockSize_];
        for (int j(0) ; j < BlockSize_ ; ++j)
        {
          bsum[j] = DT_(0);
        }
        const IT_ end(row_ptr[idx + 1]);
        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
        {
          const DT_ vali(val[i]);
          const IT_ coli(col_ind[i] * BlockSize_);
          for (int j(0) ; j < BlockSize_ ; ++j)
          {
            bsum[j] += vali * x[coli + j];
          }
        }
        for (int j(0) ; j < BlockSize_ ; ++j)
        {
          r[idx * BlockSize_ + j] = (bsum[j] * a) + b * r[idx * BlockSize_ + j];
        }
      }

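      // generic CUDA kernel for the BCSR matrix-vector product with run-time block dimensions:
      // each thread processes one block row and accumulates the BlockHeight-sized result block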
      template <typename DT_, typename IT_>
      __global__ void cuda_apply_bcsr(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
                                      const IT_ * row_ptr, const Index count, const int BlockHeight, const int BlockWidth)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= count)
          return;

        /// \todo remove hardcoded number; this kernel silently assumes BlockHeight <= 10
        DT_ bsum[10];
        for (int j(0) ; j < BlockHeight ; ++j)
        {
          bsum[j] = DT_(0);
        }
        const IT_ end(row_ptr[idx + 1]);
        for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
        {
          for (int h(0) ; h < BlockHeight ; ++h)
          {
            for (int w(0) ; w < BlockWidth ; ++w)
            {
              bsum[h] += val[i * BlockHeight * BlockWidth + h * BlockWidth + w] * x[col_ind[i] * BlockWidth + w];
            }
          }
        }
        for (int j(0) ; j < BlockHeight ; ++j)
        {
          r[idx * BlockHeight + j] = (bsum[j] * a) + b * r[idx * BlockHeight + j];
        }
      }
    } // namespace Intern
  } // namespace LAFEM
} // namespace FEAT


using namespace FEAT;
using namespace FEAT::LAFEM;
using namespace FEAT::LAFEM::Arch;

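// CSR matrix-vector product r <- a * A * x + b * y via the generic cusparseSpMV interface;
// the data type (float/double/half) and index type (32/64 bit) are mapped to the matching
// cuSPARSE enums at run time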
template <typename DT_, typename IT_>
void Apply::csr_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const bool transposed)
{
  cusparseOperation_t trans;
  if (transposed)
    trans = CUSPARSE_OPERATION_TRANSPOSE;
  else
    trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

  cudaDataType dt;
  cudaDataType ct; // compute type
  if (typeid(DT_) == typeid(double))
  {
    dt = CUDA_R_64F;
    ct = CUDA_R_64F;
  }
  else if (typeid(DT_) == typeid(float))
  {
    dt = CUDA_R_32F;
    ct = CUDA_R_32F;
  }
#ifdef FEAT_HAVE_HALFMATH
  else if (typeid(DT_) == typeid(Half))
  {
    dt = CUDA_R_16F;
    ct = CUDA_R_32F; // cusparseSpMV does not support computation in half, yet
  }
#endif
  else
    throw InternalError(__func__, __FILE__, __LINE__, "unsupported data type!");

  cusparseIndexType_t it;
  if(sizeof(IT_) == 4u)
    it = CUSPARSE_INDEX_32I;
  else if(sizeof(IT_) == 8u)
    it = CUSPARSE_INDEX_64I;
  else
    throw InternalError(__func__, __FILE__, __LINE__, "unsupported index type!");

  // cusparseSpMV expects the alpha/beta scalars in the compute type, which differs from the
  // data type in the half precision case, so convert the scalars accordingly
  const float a_f(float(a)), b_f(float(b));
  const void * alpha = (dt == CUDA_R_16F) ? (const void*)&a_f : (const void*)&a;
  const void * beta = (dt == CUDA_R_16F) ? (const void*)&b_f : (const void*)&b;

  cusparseStatus_t status;

  cusparseSpMatDescr_t descr = 0;
  status = cusparseCreateCsr(&descr, rows, columns, used_elements, (void*)row_ptr, (void*)col_ind, (void*)val, it, it, CUSPARSE_INDEX_BASE_ZERO, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));

  cusparseDnVecDescr_t dx;
  status = cusparseCreateDnVec(&dx, (transposed ? rows : columns), (void*)x, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));

  cusparseDnVecDescr_t dr;
  status = cusparseCreateDnVec(&dr, (transposed ? columns : rows), (void*)r, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));

  if (r != y)
  {
    MemoryPool::copy(r, y, (transposed ? columns : rows));
  }

  size_t buffer_size(0);
  status = cusparseSpMV_bufferSize(Util::Intern::cusparse_handle, trans, alpha, descr, dx, beta, dr, ct, CUSPARSE_SPMV_CSR_ALG1, &buffer_size);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV_bufferSize failed with status code: " + stringify(cusparseGetErrorString(status)));

  void* buffer = Util::cuda_get_static_memory(buffer_size);

  status = cusparseSpMV(Util::Intern::cusparse_handle, trans, alpha, descr, dx, beta, dr, ct, CUSPARSE_SPMV_CSR_ALG1, buffer);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV failed with status code: " + stringify(cusparseGetErrorString(status)));

  cusparseDestroySpMat(descr);
  cusparseDestroyDnVec(dx);
  cusparseDestroyDnVec(dr);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
#ifdef FEAT_HAVE_HALFMATH
template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
#endif

template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
#ifdef FEAT_HAVE_HALFMATH
template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
#endif

// accept any IT_ to silence the compiler, although only std::uint32_t instantiations are
// expected to call this overload; this works around the missing static_if in
// bcsr_wrapper_cuda, whose run-time dispatch must compile for all index types
template <typename DT_, typename IT_>
void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockSize)
{
  if (r != y)
  {
    Util::cuda_copy_device_to_device(r, y, rows * BlockSize * sizeof(DT_));
  }

  cusparseMatDescr_t descr = 0;
  cusparseCreateMatDescr(&descr);
  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  FEAT::LAFEM::Intern::cusparse_apply_bcsr(CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)rows, (int)columns, (int)used_elements, &a, descr, val, (int*)row_ptr, (int*)col_ind,
                                           BlockSize, x, &b, r);

  cusparseDestroyMatDescr(descr);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}

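// generic BCSR apply for arbitrary (BlockHeight x BlockWidth) blocks via the hand-written
// CUDA kernel above; r is zeroed when b is numerically zero, or initialized from y, before
// the kernel accumulates a * A * x + b * r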
template <typename DT_, typename IT_>
void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int BlockHeight, const int BlockWidth)
{
  Index blocksize = Util::cuda_blocksize_spmv;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  if (Math::abs(b) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows * BlockHeight);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows * BlockHeight);
  }

  FEAT::LAFEM::Intern::cuda_apply_bcsr<DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}

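// dispatches a blocked CSR apply to the fastest available backend: square blocks with 32-bit
// indices go to cuSPARSE bsrmv (or to scalar CSR for 1x1 blocks), everything else falls back
// to the generic kernel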
template <typename DT_, typename IT_>
void Apply::bcsr_wrapper_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockHeight, const int BlockWidth)
{
  // CUSPARSE
  if ((BlockHeight == BlockWidth) && (sizeof(IT_) == 4u))
  {
    // the cuSPARSE bsrmv kernel only supports block dimensions > 1; fall back to scalar CSR for 1x1 blocks
    if(BlockHeight > 1)
      bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight);
    else
      csr_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, false);
  }
  // GENERIC
  else
  {
    bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight, BlockWidth);
  }
}
template void Apply::bcsr_wrapper_cuda<float, std::uint32_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<double, std::uint32_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<float, std::uint64_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<double, std::uint64_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);

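// apply for CSR matrices with scalar entries and blocked vectors; the block size is a
// compile-time template parameter, so the kernel can keep its accumulator in registers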
template <int BlockSize_, typename DT_, typename IT_>
void Apply::csrsb_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows,
                       const Index /*columns*/, const Index /*used_elements*/)
{
  Index blocksize = Util::cuda_blocksize_spmv;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  if (Math::abs(b) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows * BlockSize_);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows * BlockSize_);
  }

  FEAT::LAFEM::Intern::cuda_apply_csrsb<BlockSize_, DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
template void Apply::csrsb_cuda<1, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);

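// apply for banded matrices (one thread per row, see cuda_apply_banded above); r is zeroed
// when beta is numerically zero, or initialized from y, before the kernel accumulates the
// band products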
template <typename DT_, typename IT_>
void Apply::banded_cuda(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta, const DT_ * const y, const DT_ * const val, const IT_ * const offsets, const Index num_of_offsets, const Index rows, const Index columns)
{
  Index blocksize = Util::cuda_blocksize_spmv;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  if (Math::abs(beta) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows);
  }

  FEAT::LAFEM::Intern::cuda_apply_banded<<<grid, block>>>(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);

  // synchronize and check for launch errors, consistent with the other kernel launches in this file
  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const Index, const Index, const Index);

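// dense matrix-vector product via the cuBLAS GEMV wrappers above; r is initialized from y
// first (when the two differ), so beta scales the correct vector in the in-place GEMV call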
template <typename DT_>
void Apply::dense_cuda(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y, const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
{
  if (r != y)
  {
    Util::cuda_copy_device_to_device(r, y, rows * sizeof(DT_));
  }

  FEAT::LAFEM::Intern::cublas_apply_dense(CUBLAS_OP_T, (int)rows, (int)columns, &alpha, val, x, &beta, r);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
template void Apply::dense_cuda(float * r, const float, const float, const float * const, const float * const, const float * const, const Index, const Index);
template void Apply::dense_cuda(double * r, const double, const double, const double * const, const double * const, const double * const, const Index, const Index);