1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
7#include <kernel/base_header.hpp>
9#include <kernel/util/exception.hpp>
10#include <kernel/util/memory_pool.hpp>
11#include <kernel/util/cuda_util.hpp>
19#include "cusparse_v2.h"
27 __global__ void cuda_sor_apply_kernel(int m, double * y, const double * x,
28 const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
30 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
34 int row = inverseRowPtr[idx];
37 for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
39 d += csrVal[col] * y[csrColInd[col]];
41 y[row] = omega * (x[row] - d) / csrVal[col];
48 struct InverseHelper<2>
50 __device__ static void compute(double (&inv)[2][2], const double * a)
52 double d = double(1) / (a[0]*a[3] - a[1]*a[2]);
61 struct InverseHelper<3>
63 __device__ static void compute(double (&inv)[3][3], const double * a)
65 inv[0][0] = a[4]*a[8] - a[5]*a[7];
66 inv[1][0] = a[5]*a[6] - a[3]*a[8];
67 inv[2][0] = a[3]*a[7] - a[4]*a[6];
68 double d = double(1) / (a[0]*inv[0][0] + a[1]*inv[1][0] + a[2]*inv[2][0]);
72 inv[0][1] = d*(a[2]*a[7] - a[1]*a[8]);
73 inv[1][1] = d*(a[0]*a[8] - a[2]*a[6]);
74 inv[2][1] = d*(a[1]*a[6] - a[0]*a[7]);
75 inv[0][2] = d*(a[1]*a[5] - a[2]*a[4]);
76 inv[1][2] = d*(a[2]*a[3] - a[0]*a[5]);
77 inv[2][2] = d*(a[0]*a[4] - a[1]*a[3]);
81 template <int BlockSize_>
82 __global__ void cuda_sor_bcsr_apply_kernel(int m, double * y, const double * x,
83 const double * csrVal, const int * csrRowPtr, const int * csrColInd, double omega, int * inverseRowPtr)
85 Index idx = threadIdx.x + blockDim.x * blockIdx.x;
89 int row = inverseRowPtr[idx];
91 for (int j(0) ; j < BlockSize_ ; ++j)
96 for (col = csrRowPtr[idx*2] ; csrColInd[col] < row ; ++col)
98 for (int h(0) ; h < BlockSize_ ; ++h)
100 for (int w(0) ; w < BlockSize_ ; ++w)
102 d[h] += csrVal[col * BlockSize_ * BlockSize_ + h * BlockSize_ + w] * y[csrColInd[col] * BlockSize_ + w];
107 double inv[BlockSize_][BlockSize_];
108 InverseHelper<BlockSize_>::compute(inv, csrVal + col * BlockSize_ * BlockSize_);
110 //y[row * BlockSize_ + j] = omega * (x[row * BlockSize_ + j] - d[j]) / csrVal[col];
111 double temp[BlockSize_];
112 for (int j(0) ; j < BlockSize_ ; ++j)
114 temp[j] = x[row * BlockSize_ + j] - d[j];
117 for (int h(0) ; h < BlockSize_ ; ++h)
120 for (int w(0) ; w < BlockSize_ ; ++w)
122 r += inv[h][w] * temp[w];
124 y[row * BlockSize_ + h] = omega * r;
128 void cuda_sor_init_symbolic(int m, int nnz, const double * csrVal, const int * csrRowPtr, const int * csrColInd, int & ncolors, int* & colored_row_ptr, int* & rows_per_color, int* & inverse_row_ptr)
130 cusparseColorInfo_t cinfo;
131 cusparseStatus_t status = cusparseCreateColorInfo(&cinfo);
132 if (status != CUSPARSE_STATUS_SUCCESS)
133 throw InternalError(__func__, __FILE__, __LINE__, "cusparseCreateColorInfo failed with status code: " + stringify(status));
135 int * d_coloring = (int*)Util::cuda_malloc(m * sizeof(int));
138 cusparseMatDescr_t descr_M = 0;
139 cusparseCreateMatDescr(&descr_M);
140 cusparseSetMatIndexBase(descr_M, CUSPARSE_INDEX_BASE_ZERO);
141 cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
143 status = cusparseDcsrcolor(Util::Intern::cusparse_handle, m, nnz, descr_M,
144 csrVal, csrRowPtr, csrColInd, &one, &ncolors, d_coloring, NULL, cinfo);
145 if (status != CUSPARSE_STATUS_SUCCESS)
146 throw InternalError(__func__, __FILE__, __LINE__, "cusparseDcsrcolor failed with status code: " + stringify(status));
148 status = cusparseDestroyColorInfo(cinfo);
149 if (status != CUSPARSE_STATUS_SUCCESS)
150 throw InternalError(__func__, __FILE__, __LINE__, "cusparseDestroyColorInfo failed with status code: " + stringify(status));
152 //std::cout<<"pre colors: "<<ncolors<<" rows: "<<m<<std::endl;
153 int * coloring = new int[m];
154 Util::cuda_copy_device_to_host(coloring, d_coloring, m * sizeof(int));
155 Util::cuda_free(d_coloring);
157 //decrement from non existing colors
158 for (int color(0) ; color < ncolors ; ++color)
160 //search for each color
162 for (int i(0) ; i < m ; ++i)
164 if (coloring[i] == color)
170 // if color was not found, decrement all remaining colors and retry search with the same color again
173 for (int i(0) ; i < m ; ++i)
175 if (coloring[i] > color)
185 /*std::cout<<"row to color:"<<std::endl;
186 for (int i(0) ; i < m ; ++i)
188 std::cout<<coloring[i]<<" ";
190 std::cout<<std::endl;*/
191 //std::cout<<"colors: "<<ncolors<<" rows: "<<m<<std::endl;
193 // count rows per color
194 //rows_per_color = MemoryPool::template allocate_memory<int>(ncolors);
195 rows_per_color = new int[ncolors];
196 for (int i(0) ; i < ncolors ; ++i)
198 rows_per_color[i] = 0;
200 for (int i(0) ; i < m ; ++i)
202 rows_per_color[coloring[i]] += 1;
205 int * colors_ascending = MemoryPool::template allocate_memory<int>(ncolors);
206 //vector of pair<rows, color>
207 std::vector<std::pair<int, int>> temp;
208 for (int i(0) ; i < ncolors ; ++i)
210 temp.push_back(std::make_pair(rows_per_color[i], i));
212 std::sort(temp.begin(), temp.end());
214 for (int i(0) ; i < ncolors ; ++i)
216 colors_ascending[i] = temp[i].second;
219 //resort rows per color by ascending row count
220 for (int i(0) ; i < ncolors ; ++i)
222 rows_per_color[i] = temp[i].first;
224 /*std::cout<<"colors ascending: "<<std::endl;
225 for (int i(0) ; i < ncolors ; ++i)
226 std::cout<<i<<" "<<colors_ascending[i]<<std::endl;
227 std::cout<<std::endl;
229 std::cout<<"rows per color:"<<std::endl;
230 for (int i(0) ; i < ncolors ; ++i)
232 std::cout<<i<<" "<<rows_per_color[i]<<std::endl;
234 std::cout<<std::endl;*/
236 int * host_irp = MemoryPool::template allocate_memory<int>(m);
237 int * host_crp = MemoryPool::template allocate_memory<int>(2*m);
238 int * host_row_ptr = MemoryPool::template allocate_memory<int>(m+1);
239 Util::cuda_copy_device_to_host(host_row_ptr, csrRowPtr, (m+1) * sizeof(int));
241 //iterate over all colors, by ascending row count
242 int crp_i(0); //index into host_crp
243 for (int color_i(0) ; color_i < ncolors ; ++color_i)
245 int color = colors_ascending[color_i];
246 //search all rows with matching color
247 for (int row(0) ; row < m ; ++row)
249 if (coloring[row] == color)
251 host_crp[crp_i*2] = host_row_ptr[row];
252 host_crp[crp_i*2+1] = host_row_ptr[row+1];
253 host_irp[crp_i] = row;
259 MemoryPool::release_memory(host_row_ptr);
261 inverse_row_ptr = (int*)Util::cuda_malloc(m * sizeof(int));
262 Util::cuda_copy_host_to_device(inverse_row_ptr, host_irp, m * sizeof(int));
264 colored_row_ptr = (int*)Util::cuda_malloc(2 * m * sizeof(int));
265 Util::cuda_copy_host_to_device(colored_row_ptr, host_crp, 2 * m * sizeof(int));
268 MemoryPool::release_memory(colors_ascending);
269 MemoryPool::release_memory(host_irp);
270 MemoryPool::release_memory(host_crp);
272 cudaDeviceSynchronize();
273#ifdef FEAT_DEBUG_MODE
274 cudaError_t last_error(cudaGetLastError());
275 if (cudaSuccess != last_error)
276 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
280 void cuda_sor_done_symbolic(int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
282 Util::cuda_free(colored_row_ptr);
283 Util::cuda_free(inverse_row_ptr);
284 //MemoryPool::release_memory(rows_per_color);
285 delete[] rows_per_color;
287 cudaDeviceSynchronize();
288#ifdef FEAT_DEBUG_MODE
289 cudaError_t last_error(cudaGetLastError());
290 if (cudaSuccess != last_error)
291 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
295 int cuda_sor_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
296 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
298 cudaMemset(y, 0, m * sizeof(double));
301 for (int i(0) ; i < ncolors ; ++i)
303 Index blocksize = Util::cuda_blocksize_spmv;
306 block.x = (unsigned)blocksize;
307 grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
309 cuda_sor_apply_kernel<<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
310 row_offset += rows_per_color[i];
313 cudaDeviceSynchronize();
314#ifdef FEAT_DEBUG_MODE
315 cudaError_t last_error(cudaGetLastError());
316 if (cudaSuccess != last_error)
317 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
323 template<int BlockSize_>
324 int cuda_sor_bcsr_apply(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
325 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr)
327 cudaMemset(y, 0, m * sizeof(double));
330 for (int i(0) ; i < ncolors ; ++i)
332 Index blocksize = Util::cuda_blocksize_spmv;
335 block.x = (unsigned)blocksize;
336 grid.x = (unsigned)ceil((rows_per_color[i])/(double)(block.x));
338 cuda_sor_bcsr_apply_kernel<BlockSize_><<<grid, block>>>(rows_per_color[i], y, x, csrVal, colored_row_ptr + row_offset * 2, csrColInd, omega, inverse_row_ptr + row_offset);
339 row_offset += rows_per_color[i];
342 cudaDeviceSynchronize();
343#ifdef FEAT_DEBUG_MODE
344 cudaError_t last_error(cudaGetLastError());
345 if (cudaSuccess != last_error)
346 throw InternalError(__func__, __FILE__, __LINE__, "CUDA error occurred in execution!\n" + stringify(cudaGetErrorString(last_error)));
351 // cuda_sor_bcsr_apply_kernel is hardcoded for BS_ == 2 && BS_ == 3
352 template int cuda_sor_bcsr_apply<2>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
353 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);
354 template int cuda_sor_bcsr_apply<3>(int m, double * y, const double * x, const double * csrVal, const int * csrColInd, int ncolors, double omega,
355 int * colored_row_ptr, int * rows_per_color, int * inverse_row_ptr);