FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
apply.cu
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6// includes, FEAT
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/apply.hpp>
9#include <kernel/lafem/arch/component_product.hpp>
10#include <kernel/lafem/arch/scale.hpp>
11#include <kernel/util/exception.hpp>
12#include <kernel/util/memory_pool.hpp>
13#include <kernel/util/math.hpp>
14#include <kernel/util/half.hpp>
15
16#include "cusparse_v2.h"
17
18namespace FEAT
19{
20 namespace LAFEM
21 {
22 namespace Intern
23 {
      /**
       * CUDA kernel: applies a banded matrix, computing r[idx] = alpha * (A*x)[idx] + beta * r[idx].
       *
       * One thread handles one matrix row. 'offsets' holds the diagonal offsets shifted by
       * (rows - 1), so the column index of diagonal 'diag' in row 'idx' is
       * idx + offsets[diag] - (rows - 1); 'val' stores 'rows' entries per diagonal.
       *
       * NOTE(review): the 'start' search below has no bound against num_of_offsets — it
       * presumably relies on the container guaranteeing at least one diagonal intersecting
       * every row; confirm this invariant against the banded matrix class.
       */
      template <typename DT_, typename IT_>
      __global__ void cuda_apply_banded(DT_ * r, const DT_ alpha, const DT_ * x, const DT_ beta, const DT_ * val, const IT_ * offsets, const Index num_of_offsets, const Index rows, const Index columns)
      {
        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
        // guard the grid tail: the launch rounds rows up to a multiple of the block size
        if (idx >= rows)
          return;

        // k1 = shifted offset of the main diagonal, k2 = one past the last valid shifted offset
        const Index k1(rows - 1);
        const Index k2(rows + columns - 1);

        Index start(0);

        // skip leading diagonals whose column index would be negative for this row
        while (k1 > offsets[start] + idx)
        {
          ++start;
        }

        Index end(start);

        // advance 'end' to one past the last diagonal whose column index is still < columns
        while (end < num_of_offsets && idx + offsets[end] < k2)
        {
          ++end;
        }

        // accumulate the row dot product over the intersecting diagonals
        DT_ sum(DT_(0.0));
        for (Index diag(start); diag < end; ++diag)
        {
          sum += val[rows * diag + idx] * x[idx + offsets[diag] - rows + 1];
        }
        r[idx] = (sum*alpha) + beta * r[idx];
      }
55
56 void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
57 int m, int n, int nnz,
58 const float * alpha, const cusparseMatDescr_t descrA,
59 const float * csrVal, const int * csrRowPtr, const int *csrColInd,
60 int block_dim,
61 const float * x, const float * beta, float * y)
62 {
63 cusparseStatus_t status;
64 status = cusparseSbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
65 csrColInd, block_dim, x, beta, y);
66 if (status != CUSPARSE_STATUS_SUCCESS)
67 throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
68 }
69
70 void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
71 int m, int n, int nnz,
72 const double * alpha, const cusparseMatDescr_t descrA,
73 const double * csrVal, const int * csrRowPtr, const int *csrColInd,
74 int block_dim,
75 const double * x, const double * beta, double * y)
76 {
77 cusparseStatus_t status;
78 status = cusparseDbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
79 csrColInd, block_dim, x, beta, y);
80 if (status != CUSPARSE_STATUS_SUCCESS)
81 throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
82 }
83
84 void cublas_apply_dense(cublasOperation_t trans,
85 int m, int n,
86 const float * alpha,
87 const float * val,
88 const float * x, const float * beta, float * y)
89 {
90 cublasStatus_t status;
91 status = cublasSgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
92 if (status != CUBLAS_STATUS_SUCCESS)
93 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
94 }
95
96 void cublas_apply_dense(cublasOperation_t trans,
97 int m, int n,
98 const double * alpha,
99 const double * val,
100 const double * x, const double * beta, double * y)
101 {
102 cublasStatus_t status;
103 status = cublasDgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
104 if (status != CUBLAS_STATUS_SUCCESS)
105 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
106 }
107
108 template <int BlockSize_, typename DT_, typename IT_>
109 __global__ void cuda_apply_csrsb(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
110 const IT_ * row_ptr, const Index count)
111 {
112 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
113 if (idx >= count)
114 return;
115
116 DT_ bsum[BlockSize_];
117 for (int j(0) ; j < BlockSize_ ; ++j)
118 {
119 bsum[j] = DT_(0);
120 }
121 const IT_ end(row_ptr[idx + 1]);
122 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
123 {
124 const DT_ vali(val[i]);
125 const IT_ coli(col_ind[i] * BlockSize_);
126 for (int j(0) ; j < BlockSize_ ; ++j)
127 {
128 bsum[j] += vali * x[coli + j];
129 }
130 }
131 for (int j(0) ; j < BlockSize_ ; ++j)
132 {
133 r[idx * BlockSize_ + j] = (bsum[j] * a) + b * r[idx * BlockSize_ + j];
134 }
135 }
136
137 template <typename DT_, typename IT_>
138 __global__ void cuda_apply_bcsr(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
139 const IT_ * row_ptr, const Index count, const int BlockHeight, const int BlockWidth)
140 {
141 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
142 if (idx >= count)
143 return;
144
145 /// \todo remove hardcoded number
146 DT_ bsum[10];
147 for (int j(0) ; j < BlockHeight ; ++j)
148 {
149 bsum[j] = DT_(0);
150 }
151 const IT_ end(row_ptr[idx + 1]);
152 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
153 {
154 for (int h(0) ; h < BlockHeight ; ++h)
155 {
156 for (int w(0) ; w < BlockWidth ; ++w)
157 {
158 bsum[h] += val[i * BlockHeight * BlockWidth + h * BlockWidth + w] * x[col_ind[i] * BlockWidth + w];
159 }
160 }
161 }
162 for (int j(0) ; j < BlockHeight ; ++j)
163 {
164 r[idx * BlockHeight + j] = (bsum[j] * a) + b * r[idx * BlockHeight + j];
165 }
166 }
167 }
168 }
169}
170
171
172using namespace FEAT;
173using namespace FEAT::LAFEM;
174using namespace FEAT::LAFEM::Arch;
175
/**
 * Applies a scalar CSR matrix via the generic cusparseSpMV interface:
 * r <- a * op(A) * x + b * y, where op(A) = A^T if 'transposed'.
 *
 * 'y' is first copied into 'r' (if distinct) because cuSPARSE accumulates
 * into the output vector. Supports float/double and — with FEAT_HAVE_HALFMATH —
 * half precision (computed in float), with 32- or 64-bit index types.
 */
template <typename DT_, typename IT_>
void Apply::csr_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const bool transposed)
{
  cusparseOperation_t trans;
  if (transposed)
    trans = CUSPARSE_OPERATION_TRANSPOSE;
  else
    trans = CUSPARSE_OPERATION_NON_TRANSPOSE;

  // map the template data type onto the cuSPARSE data/compute type enums
  cudaDataType dt;
  cudaDataType ct; //compute type
  if (typeid(DT_) == typeid(double))
  {
    dt = CUDA_R_64F;
    ct = CUDA_R_64F;
  }
  else if (typeid(DT_) == typeid(float))
  {
    dt = CUDA_R_32F;
    ct = CUDA_R_32F;
  }
#ifdef FEAT_HAVE_HALFMATH
  else if (typeid(DT_) == typeid(Half))
  {
    dt = CUDA_R_16F;
    ct = CUDA_R_32F; //cusparseSpMV does not support computation in half, yet
  }
#endif
  else
    throw InternalError(__func__, __FILE__, __LINE__, "unsupported data type!");

  // map the template index type onto the cuSPARSE index type enum
  cusparseIndexType_t it;
  if(sizeof(IT_) == 4u)
    it = CUSPARSE_INDEX_32I;
  else if(sizeof(IT_) == 8u)
    it = CUSPARSE_INDEX_64I;
  else
    throw InternalError(__func__, __FILE__, __LINE__, "unsupported index type!");

  cusparseStatus_t status;

  // wrap the raw CSR arrays in a cuSPARSE sparse-matrix descriptor
  cusparseSpMatDescr_t descr=0;
  status = cusparseCreateCsr(&descr, rows, columns, used_elements, (void*)row_ptr, (void*)col_ind, (void*)val, it, it, CUSPARSE_INDEX_BASE_ZERO, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));

  // dense-vector descriptors; the lengths swap when applying the transpose
  cusparseDnVecDescr_t dx;
  status = cusparseCreateDnVec(&dx, (transposed?rows:columns), (void*)x, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));

  cusparseDnVecDescr_t dr;
  status = cusparseCreateDnVec(&dr, (transposed?columns:rows), (void*)r, dt);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));


  // SpMV accumulates into r, so seed it with y first
  if (r != y)
  {
    MemoryPool::copy(r, y, (transposed?columns:rows));
  }

  // float copies of the scalars; only used in the half-precision branch below
  // (for double these temporaries are narrowing copies but remain unused)
  float alpha_tmp = a;
  float beta_tmp = b;

  // Due to cuda requirements alpha and beta need to be float if this function is called with Half
  void* const alpha_ptr = dt == CUDA_R_16F ? (void*)&alpha_tmp : (void*)&a;
  void* const beta_ptr = dt == CUDA_R_16F ? (void*)&beta_tmp : (void*)&b;

  // query the workspace size and fetch a buffer (presumably a reused static
  // allocation managed by Util — not freed here)
  size_t buffer_size(0);
  status = cusparseSpMV_bufferSize(Util::Intern::cusparse_handle, trans, alpha_ptr, descr, dx, beta_ptr, dr, ct, CUSPARSE_SPMV_CSR_ALG1, &buffer_size);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV_bufferSize failed with status code: " + stringify(cusparseGetErrorString(status)));

  void* buffer = Util::cuda_get_static_memory(buffer_size);

  status = cusparseSpMV(Util::Intern::cusparse_handle, trans, alpha_ptr, descr, dx, beta_ptr, dr, ct, CUSPARSE_SPMV_CSR_ALG1, buffer);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV failed with status code: " + stringify(cusparseGetErrorString(status)));

  cusparseDestroySpMat(descr);
  cusparseDestroyDnVec(dx);
  cusparseDestroyDnVec(dr);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
// explicit instantiations of Apply::csr_cuda for 32-bit index types
template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
#ifdef FEAT_HAVE_HALFMATH
template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
#endif

// explicit instantiations of Apply::csr_cuda for 64-bit index types
template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
#ifdef FEAT_HAVE_HALFMATH
template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
#endif
278
//silence the compiler by pretending to accept any IT_ but hopefully only 'std::uint32_t' calls will be made
//this circumnavigates the missing static_if in bcsr_wrapper
/**
 * Applies a square-blocked BCSR matrix via cuSPARSE bsrmv: r = a*A*x + b*y.
 *
 * bsrmv accumulates into the output vector, so 'y' is copied to 'r' first
 * when the two differ. 'rows'/'columns' are block counts, 'BlockSize' is the
 * (square) block dimension.
 */
template <typename DT_, typename IT_>
void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockSize)
{
  if (r != y)
  {
    Util::cuda_copy_device_to_device(r, y, rows * BlockSize * sizeof(DT_));
  }

  cusparseMatDescr_t descr=0;
  // check the descriptor creation status, consistent with the error handling
  // of the other cuSPARSE calls in this file
  cusparseStatus_t status = cusparseCreateMatDescr(&descr);
  if (status != CUSPARSE_STATUS_SUCCESS)
    throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreateMatDescr failed with status code: " + stringify(cusparseGetErrorString(status)));
  cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  // NOTE(review): the index arrays are reinterpreted as 32-bit ints here; per the
  // comment above, only std::uint32_t instantiations may reach this path.
  FEAT::LAFEM::Intern::cusparse_apply_bcsr(CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)rows, (int)columns, (int)used_elements, &a, descr, val, (int*)row_ptr, (int*)col_ind,
    BlockSize, x, &b, r);

  cusparseDestroyMatDescr(descr);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
306
/**
 * Applies a BCSR matrix with arbitrary (possibly rectangular) block dimensions
 * via the generic CUDA kernel: r = a*A*x + b*y.
 */
template <typename DT_, typename IT_>
void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int BlockHeight, const int BlockWidth)
{
  // launch configuration: one thread per block-row, grid rounded up
  const Index threads = Util::cuda_blocksize_spmv;
  dim3 block;
  dim3 grid;
  block.x = (unsigned)threads;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  // seed r: b == 0 -> clear it, otherwise start from y
  // (the kernel computes r = a*A*x + b*r)
  if (Math::abs(b) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows * BlockHeight);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows * BlockHeight);
  }

  FEAT::LAFEM::Intern::cuda_apply_bcsr<DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
334
/**
 * Entry point for the blocked CSR apply: picks the cuSPARSE bsrmv path when
 * its requirements are met, otherwise falls back to the generic CUDA kernel.
 */
template <typename DT_, typename IT_>
void Apply::bcsr_wrapper_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockHeight, const int BlockWidth)
{
  // cuSPARSE only handles square blocks with 32-bit indices
  const bool square_blocks(BlockHeight == BlockWidth);
  const bool int32_indices(sizeof(IT_) == 4u);

  if (square_blocks && int32_indices)
  {
    // cuSPARSE bsrmv requires block dimension > 1; 1x1 blocks are plain CSR
    if (BlockHeight > 1)
      bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight);
    else
      csr_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, false);
  }
  else
  {
    // generic kernel: rectangular blocks and/or 64-bit indices
    bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight, BlockWidth);
  }
}
// explicit instantiations of Apply::bcsr_wrapper_cuda for both data and index types
template void Apply::bcsr_wrapper_cuda<float, std::uint32_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<double, std::uint32_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<float, std::uint64_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
template void Apply::bcsr_wrapper_cuda<double, std::uint64_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
357
/**
 * Applies a scalar CSR matrix to a blocked vector: r = a*A*x + b*y.
 *
 * BlockSize_ is the compile-time number of components per vector entry;
 * one CUDA thread handles one matrix row.
 */
template <int BlockSize_, typename DT_, typename IT_>
void Apply::csrsb_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows,
    const Index /*columns*/, const Index /*used_elements*/)
{
  // launch configuration: one thread per row, grid rounded up
  const Index threads = Util::cuda_blocksize_spmv;
  dim3 block;
  dim3 grid;
  block.x = (unsigned)threads;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  // seed r: b == 0 -> clear it, otherwise start from y
  // (the kernel computes r = a*A*x + b*r)
  if (Math::abs(b) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows * BlockSize_);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows * BlockSize_);
  }

  FEAT::LAFEM::Intern::cuda_apply_csrsb<BlockSize_, DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
// explicit instantiations of Apply::csrsb_cuda for block sizes 1..3,
// both data types and both index types
template void Apply::csrsb_cuda<1, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<1, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<2, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, float, std::uint64_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, double, std::uint64_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, float, std::uint32_t>
  (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::csrsb_cuda<3, double, std::uint32_t>
  (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
410
/**
 * Applies a banded matrix: r = alpha*A*x + beta*y.
 *
 * 'offsets' holds the num_of_offsets shifted diagonal offsets, 'val' the band
 * values with 'rows' entries per diagonal; one CUDA thread handles one row.
 */
template <typename DT_, typename IT_>
void Apply::banded_cuda(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta, const DT_ * const y, const DT_ * const val, const IT_ * const offsets, const Index num_of_offsets, const Index rows, const Index columns)
{
  Index blocksize = Util::cuda_blocksize_spmv;
  dim3 grid;
  dim3 block;
  block.x = (unsigned)blocksize;
  grid.x = (unsigned)ceil((rows)/(double)(block.x));

  // seed r: beta == 0 -> clear it, otherwise start from y
  // (the kernel computes r = alpha*A*x + beta*r)
  if (Math::abs(beta) < Math::eps<DT_>())
  {
    MemoryPool::set_memory(r, DT_(0), rows);
  }
  else if (r != y)
  {
    MemoryPool::copy(r, y, rows);
  }

  FEAT::LAFEM::Intern::cuda_apply_banded<<<grid, block>>>(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);

  // synchronize and (in debug mode) surface launch/execution errors — this was
  // missing here, unlike every other apply wrapper in this file
  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
// explicit instantiations of Apply::banded_cuda for both data and index types
template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const Index, const Index, const Index);
template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const Index, const Index, const Index);
435
/**
 * Dense matrix-vector product r = alpha*A*x + beta*y via cuBLAS gemv.
 *
 * The row-major 'val' array is handed to the column-major cuBLAS interface
 * with CUBLAS_OP_T and swapped dimensions (see cublas_apply_dense).
 */
template <typename DT_>
void Apply::dense_cuda(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y, const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
{
  // gemv accumulates into its output vector, so initialize r with y first
  if (y != r)
  {
    Util::cuda_copy_device_to_device(r, y, sizeof(DT_) * rows);
  }

  FEAT::LAFEM::Intern::cublas_apply_dense(CUBLAS_OP_T, (int)rows, (int)columns, &alpha, val, x, &beta, r);

  cudaDeviceSynchronize();
#ifdef FEAT_DEBUG_MODE
  cudaError_t last_error(cudaGetLastError());
  if (cudaSuccess != last_error)
    throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
#endif
}
// explicit instantiations of Apply::dense_cuda for both data types
template void Apply::dense_cuda(float * r, const float, const float, const float * const, const float * const, const float * const, const Index, const Index);
template void Apply::dense_cuda(double * r, const double, const double, const double * const, const double * const, const double * const, const Index, const Index);