1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
8#include <kernel/lafem/arch/apply.hpp>
9#include <kernel/lafem/arch/component_product.hpp>
10#include <kernel/lafem/arch/scale.hpp>
11#include <kernel/util/exception.hpp>
12#include <kernel/util/memory_pool.hpp>
13#include <kernel/util/math.hpp>
14#include <kernel/util/half.hpp>
16#include "cusparse_v2.h"
24 template <typename DT_, typename IT_>
25 __global__ void cuda_apply_banded(DT_ * r, const DT_ alpha, const DT_ * x, const DT_ beta, const DT_ * val, const IT_ * offsets, const Index num_of_offsets, const Index rows, const Index columns)
27 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
31 const Index k1(rows - 1);
32 const Index k2(rows + columns - 1);
36 while (k1 > offsets[start] + idx)
43 while (end < num_of_offsets && idx + offsets[end] < k2)
49 for (Index diag(start); diag < end; ++diag)
51 sum += val[rows * diag + idx] * x[idx + offsets[diag] - rows + 1];
53 r[idx] = (sum*alpha) + beta * r[idx];
56 void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
57 int m, int n, int nnz,
58 const float * alpha, const cusparseMatDescr_t descrA,
59 const float * csrVal, const int * csrRowPtr, const int *csrColInd,
61 const float * x, const float * beta, float * y)
63 cusparseStatus_t status;
64 status = cusparseSbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
65 csrColInd, block_dim, x, beta, y);
66 if (status != CUSPARSE_STATUS_SUCCESS)
67 throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
70 void cusparse_apply_bcsr(cusparseDirection_t dir, cusparseOperation_t trans,
71 int m, int n, int nnz,
72 const double * alpha, const cusparseMatDescr_t descrA,
73 const double * csrVal, const int * csrRowPtr, const int *csrColInd,
75 const double * x, const double * beta, double * y)
77 cusparseStatus_t status;
78 status = cusparseDbsrmv(Util::Intern::cusparse_handle, dir, trans, m, n, nnz, alpha, descrA, csrVal, csrRowPtr,
79 csrColInd, block_dim, x, beta, y);
80 if (status != CUSPARSE_STATUS_SUCCESS)
81 throw InternalError(__func__, __FILE__, __LINE__, "cusparsebsrmv failed with status code: " + stringify(status));
84 void cublas_apply_dense(cublasOperation_t trans,
88 const float * x, const float * beta, float * y)
90 cublasStatus_t status;
91 status = cublasSgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
92 if (status != CUBLAS_STATUS_SUCCESS)
93 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
96 void cublas_apply_dense(cublasOperation_t trans,
100 const double * x, const double * beta, double * y)
102 cublasStatus_t status;
103 status = cublasDgemv(Util::Intern::cublas_handle, trans, n, m, alpha, val, n, x, 1, beta, y, 1);
104 if (status != CUBLAS_STATUS_SUCCESS)
105 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cublasGetStatusString(status)));
108 template <int BlockSize_, typename DT_, typename IT_>
109 __global__ void cuda_apply_csrsb(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
110 const IT_ * row_ptr, const Index count)
112 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
116 DT_ bsum[BlockSize_];
117 for (int j(0) ; j < BlockSize_ ; ++j)
121 const IT_ end(row_ptr[idx + 1]);
122 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
124 const DT_ vali(val[i]);
125 const IT_ coli(col_ind[i] * BlockSize_);
126 for (int j(0) ; j < BlockSize_ ; ++j)
128 bsum[j] += vali * x[coli + j];
131 for (int j(0) ; j < BlockSize_ ; ++j)
133 r[idx * BlockSize_ + j] = (bsum[j] * a) + b * r[idx * BlockSize_ + j];
137 template <typename DT_, typename IT_>
138 __global__ void cuda_apply_bcsr(DT_ * r, const DT_ a, const DT_ * x, const DT_ b, const DT_ * val, const IT_ * col_ind,
139 const IT_ * row_ptr, const Index count, const int BlockHeight, const int BlockWidth)
141 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
145 /// \todo remove hardcoded number
147 for (int j(0) ; j < BlockHeight ; ++j)
151 const IT_ end(row_ptr[idx + 1]);
152 for (IT_ i(row_ptr[idx]) ; i < end ; ++i)
154 for (int h(0) ; h < BlockHeight ; ++h)
156 for (int w(0) ; w < BlockWidth ; ++w)
158 bsum[h] += val[i * BlockHeight * BlockWidth + h * BlockWidth + w] * x[col_ind[i] * BlockWidth + w];
162 for (int j(0) ; j < BlockHeight ; ++j)
164 r[idx * BlockHeight + j] = (bsum[j] * a) + b * r[idx * BlockHeight + j];
173using namespace FEAT::LAFEM;
174using namespace FEAT::LAFEM::Arch;
176template <typename DT_, typename IT_>
177void Apply::csr_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const bool transposed)
179 cusparseOperation_t trans;
181 trans = CUSPARSE_OPERATION_TRANSPOSE;
183 trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
186 cudaDataType ct; //compute type
187 if (typeid(DT_) == typeid(double))
192 else if (typeid(DT_) == typeid(float))
197#ifdef FEAT_HAVE_HALFMATH
198 else if (typeid(DT_) == typeid(Half))
201 ct = CUDA_R_32F; //cusparseSpMV does not support computation in half, yet
205 throw InternalError(__func__, __FILE__, __LINE__, "unsupported data type!");
207 cusparseIndexType_t it;
208 if(sizeof(IT_) == 4u)
209 it = CUSPARSE_INDEX_32I;
210 else if(sizeof(IT_) == 8u)
211 it = CUSPARSE_INDEX_64I;
213 throw InternalError(__func__, __FILE__, __LINE__, "unsupported index type!");
215 cusparseStatus_t status;
217 cusparseSpMatDescr_t descr=0;
218 status = cusparseCreateCsr(&descr, rows, columns, used_elements, (void*)row_ptr, (void*)col_ind, (void*)val, it, it, CUSPARSE_INDEX_BASE_ZERO, dt);
219 if (status != CUSPARSE_STATUS_SUCCESS)
220 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
222 cusparseDnVecDescr_t dx;
223 status = cusparseCreateDnVec(&dx, (transposed?rows:columns), (void*)x, dt);
224 if (status != CUSPARSE_STATUS_SUCCESS)
225 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
227 cusparseDnVecDescr_t dr;
228 status = cusparseCreateDnVec(&dr, (transposed?columns:rows), (void*)r, dt);
229 if (status != CUSPARSE_STATUS_SUCCESS)
230 throw InternalError(__func__, __FILE__, __LINE__, "cuda error: " + stringify(cusparseGetErrorString(status)));
235 MemoryPool::copy(r, y, (transposed?columns:rows));
241 // Due to cuda requirements alpha and beta need to be float if this function is called with Half
242 void* const alpha_ptr = dt == CUDA_R_16F ? (void*)&alpha_tmp : (void*)&a;
243 void* const beta_ptr = dt == CUDA_R_16F ? (void*)&beta_tmp : (void*)&b;
245 size_t buffer_size(0);
246 status = cusparseSpMV_bufferSize(Util::Intern::cusparse_handle, trans, alpha_ptr, descr, dx, beta_ptr, dr, ct, CUSPARSE_SPMV_CSR_ALG1, &buffer_size);
247 if (status != CUSPARSE_STATUS_SUCCESS)
248 throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV_bufferSize failed with status code: " + stringify(cusparseGetErrorString(status)));
250 void* buffer = Util::cuda_get_static_memory(buffer_size);
252 status = cusparseSpMV(Util::Intern::cusparse_handle, trans, alpha_ptr, descr, dx, beta_ptr, dr, ct, CUSPARSE_SPMV_CSR_ALG1, buffer);
253 if (status != CUSPARSE_STATUS_SUCCESS)
254 throw InternalError(__func__, __FILE__, __LINE__, "cusparseSpMV failed with status code: " + stringify(cusparseGetErrorString(status)));
256 cusparseDestroySpMat(descr);
257 cusparseDestroyDnVec(dx);
258 cusparseDestroyDnVec(dr);
260 cudaDeviceSynchronize();
261#ifdef FEAT_DEBUG_MODE
262 cudaError_t last_error(cudaGetLastError());
263 if (cudaSuccess != last_error)
264 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
267template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
268template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
269#ifdef FEAT_HAVE_HALFMATH
270template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const bool);
273template void Apply::csr_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
274template void Apply::csr_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
275#ifdef FEAT_HAVE_HALFMATH
276template void Apply::csr_cuda(Half *, const Half, const Half * const, const Half, const Half * const, const Half * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const bool);
279//silence the compiler by pretending to accept any IT_ but hopefully only 'std::uint32_t' calls will be made
280//this circumnavigates the missing static_if in bcsr_wrapper
281template <typename DT_, typename IT_>
282void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockSize)
286 Util::cuda_copy_device_to_device(r, y, rows * BlockSize * sizeof(DT_));
289 cusparseMatDescr_t descr=0;
290 cusparseCreateMatDescr(&descr);
291 cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
292 cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
294 FEAT::LAFEM::Intern::cusparse_apply_bcsr(CUSPARSE_DIRECTION_ROW, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)rows, (int)columns, (int)used_elements, &a, descr, val, (int*)row_ptr, (int*)col_ind,
295 BlockSize, x, &b, r);
297 cusparseDestroyMatDescr(descr);
299 cudaDeviceSynchronize();
300#ifdef FEAT_DEBUG_MODE
301 cudaError_t last_error(cudaGetLastError());
302 if (cudaSuccess != last_error)
303 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
307template <typename DT_, typename IT_>
308void Apply::bcsr_intern_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index /*columns*/, const Index /*used_elements*/, const int BlockHeight, const int BlockWidth)
310 Index blocksize = Util::cuda_blocksize_spmv;
313 block.x = (unsigned)blocksize;
314 grid.x = (unsigned)ceil((rows)/(double)(block.x));
316 if (Math::abs(b) < Math::eps<DT_>())
318 MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockHeight);
322 MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockHeight);
325 FEAT::LAFEM::Intern::cuda_apply_bcsr<DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows, BlockHeight, BlockWidth);
327 cudaDeviceSynchronize();
328#ifdef FEAT_DEBUG_MODE
329 cudaError_t last_error(cudaGetLastError());
330 if (cudaSuccess != last_error)
331 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
335template <typename DT_, typename IT_>
336void Apply::bcsr_wrapper_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows, const Index columns, const Index used_elements, const int BlockHeight, const int BlockWidth)
339 if ((BlockHeight == BlockWidth) && (sizeof(IT_) == 4u))
341 // CUSPARSE BCSR kernel only supports block sizes > 1; call scalar CSR in this case instead
343 bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight);
345 csr_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, false);
350 bcsr_intern_cuda<DT_, IT_>(r, a, x, b, y, val, col_ind, row_ptr, rows, columns, used_elements, BlockHeight, BlockWidth);
353template void Apply::bcsr_wrapper_cuda<float, std::uint32_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
354template void Apply::bcsr_wrapper_cuda<double, std::uint32_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index, const int, const int);
355template void Apply::bcsr_wrapper_cuda<float, std::uint64_t>(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
356template void Apply::bcsr_wrapper_cuda<double, std::uint64_t>(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index, const int, const int);
358template <int BlockSize_, typename DT_, typename IT_>
359void Apply::csrsb_cuda(DT_ * r, const DT_ a, const DT_ * const x, const DT_ b, const DT_ * const y, const DT_ * const val, const IT_ * const col_ind, const IT_ * const row_ptr, const Index rows,
360 const Index /*columns*/, const Index /*used_elements*/)
362 Index blocksize = Util::cuda_blocksize_spmv;
365 block.x = (unsigned)blocksize;
366 grid.x = (unsigned)ceil((rows)/(double)(block.x));
368 if (Math::abs(b) < Math::eps<DT_>())
370 MemoryPool::set_memory(r, DT_(0), /*(transposed?columns:rows)*/ rows * BlockSize_);
374 MemoryPool::copy(r, y, /*(transposed?columns:rows)*/ rows * BlockSize_);
377 FEAT::LAFEM::Intern::cuda_apply_csrsb<BlockSize_, DT_, IT_><<<grid, block>>>(r, a, x, b, val, col_ind, row_ptr, rows);
379 cudaDeviceSynchronize();
380#ifdef FEAT_DEBUG_MODE
381 cudaError_t last_error(cudaGetLastError());
382 if (cudaSuccess != last_error)
383 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
386template void Apply::csrsb_cuda<1, float, std::uint64_t>
387 (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
388template void Apply::csrsb_cuda<1, double, std::uint64_t>
389 (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
390template void Apply::csrsb_cuda<1, float, std::uint32_t>
391 (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
392template void Apply::csrsb_cuda<1, double, std::uint32_t>
393 (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
394template void Apply::csrsb_cuda<2, float, std::uint64_t>
395 (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
396template void Apply::csrsb_cuda<2, double, std::uint64_t>
397 (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
398template void Apply::csrsb_cuda<2, float, std::uint32_t>
399 (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
400template void Apply::csrsb_cuda<2, double, std::uint32_t>
401 (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
402template void Apply::csrsb_cuda<3, float, std::uint64_t>
403 (float *, const float, const float *, const float, const float *, const float * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
404template void Apply::csrsb_cuda<3, double, std::uint64_t>
405 (double *, const double, const double *, const double, const double *, const double * const, const std::uint64_t * const, const std::uint64_t * const, const Index, const Index, const Index);
406template void Apply::csrsb_cuda<3, float, std::uint32_t>
407 (float *, const float, const float *, const float, const float *, const float * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
408template void Apply::csrsb_cuda<3, double, std::uint32_t>
409 (double *, const double, const double *, const double, const double *, const double * const, const std::uint32_t * const, const std::uint32_t * const, const Index, const Index, const Index);
411template <typename DT_, typename IT_>
412void Apply::banded_cuda(DT_ * r, const DT_ alpha, const DT_ * const x, const DT_ beta, const DT_ * const y, const DT_ * const val, const IT_ * const offsets, const Index num_of_offsets, const Index rows, const Index columns)
414 Index blocksize = Util::cuda_blocksize_spmv;
417 block.x = (unsigned)blocksize;
418 grid.x = (unsigned)ceil((rows)/(double)(block.x));
420 if (Math::abs(beta) < Math::eps<DT_>())
422 MemoryPool::set_memory(r, DT_(0), rows);
426 MemoryPool::copy(r, y, rows);
429 FEAT::LAFEM::Intern::cuda_apply_banded<<<grid, block>>>(r, alpha, x, beta, val, offsets, num_of_offsets, rows, columns);
431template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint32_t * const, const Index, const Index, const Index);
432template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint32_t * const, const Index, const Index, const Index);
433template void Apply::banded_cuda(float *, const float, const float * const, const float, const float * const, const float * const, const std::uint64_t * const, const Index, const Index, const Index);
434template void Apply::banded_cuda(double *, const double, const double * const, const double, const double * const, const double * const, const std::uint64_t * const, const Index, const Index, const Index);
436template <typename DT_>
437void Apply::dense_cuda(DT_ * r, const DT_ alpha, const DT_ beta, const DT_ * const y, const DT_ * const val, const DT_ * const x, const Index rows, const Index columns)
441 Util::cuda_copy_device_to_device(r, y, rows * sizeof(DT_));
444 FEAT::LAFEM::Intern::cublas_apply_dense(CUBLAS_OP_T, (int)rows, (int)columns, &alpha, val, x, &beta, r);
446 cudaDeviceSynchronize();
447#ifdef FEAT_DEBUG_MODE
448 cudaError_t last_error(cudaGetLastError());
449 if (cudaSuccess != last_error)
450 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
453template void Apply::dense_cuda(float * r, const float, const float, const float * const, const float * const, const float * const, const Index, const Index);
454template void Apply::dense_cuda(double * r, const double, const double, const double * const, const double * const, const double * const, const Index, const Index);