1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    6#include <kernel/base_header.hpp>
 
    7#include <kernel/util/cuda_util.hpp>
 
    8#include <kernel/voxel_assembly/poisson_assembler.hpp>
 
    9#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
 
   10#include <kernel/voxel_assembly/helper/space_helper.hpp>
 
   11#include <kernel/util/tiny_algebra.hpp>
 
   15  namespace VoxelAssembly
 
   19      /**************************************************************************************************************/
 
   21      /**************************************************************************************************************/
 
   24      * \brief Cuda kernel poisson csr matrix assembly
 
   26      * \tparam Space_ The underlying spacetype.
 
   27      * \tparam DT_ The datatype used.
 
   28      * \tparam IT_ The indextype.
 
   29      * \tparam pol_ The scatter and gather policy. See MatrixGatherScatterPolicy for details.
 
   31      * \warning All ptr used here have to be unified or device pointer.
 
   32      * \param[out] matrix_data The matrix data array initilized as a unified memory array. Is added upon, so format before!
 
   33      * \param[in] matrix_row_ptr Row ptr of underlying csr matrix.
 
   34      * \param[in] matrix_col_idx Column index array of underlying csr matrix.
 
   35      * \param[in] matrix_num_rows Number of rows of csr matrix.
 
   36      * \param[in] matrix_num_cols Number of columns of csr matrix.
 
   37      * \param[in] cub_pt Array of cubature coordinate points.
 
   38      * \param[in] cub_wg Array of cubature weights.
 
   39      * \param[in] num_cubs Number of cubature points.
 
   40      * \param[in] alpha Scaling parameter for assembly.
 
   41      * \param[in] cell_to_dof Mapping cell to dof index.
 
   42      * \param[in] cell_num Number of cells.
 
   43      * \param[in] nodes Array of node points. Needed for local trafo assembly. /TODO: Version with precalced trafos.
 
   44      * \param[in] node_size Number of nodes.
 
   45      * \param[in] coloring_map Array of cell mapping for this color.
 
   46      * \param[in] coloring_size Size of the coloring array.
 
   47      * \param[in] cell_to_dof_sorter Depending on the used policy, this should either be nullptr, a local sorted array (or a col_ptr)
 
   49      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
   50      __global__ void poisson_assembler_matrix1_csr(DT_* __restrict__ matrix_data,
 
   51                const IT_* __restrict__  matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
   52                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
   53                const DT_* __restrict__  cub_wg, Index num_cubs, DT_ alpha,
 
   54                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
   55                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
   56                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter)
 
   58        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   59        if(idx >= coloring_size)
 
   62        typedef Space_ SpaceType;
 
   64        typedef IT_ IndexType;
 
   66        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
   68        constexpr int dim = SpaceHelp::dim;
 
   71        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
   72        // get number of nodes per element
 
   73        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
   74        // Our local matrix type
 
   75        typedef Tiny::Matrix<DataType, num_loc_dofs, num_loc_dofs> LocMatType;
 
   77        // define local coeffs
 
   78        Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
   80        // define local arrays
 
   81        Tiny::Matrix<DataType, num_loc_dofs, num_loc_dofs> loc_mat;
 
   83        typename SpaceHelp::EvalData basis_data;
 
   85        // now do work for this cell
 
   86        const Index cell = Index(coloring_map[idx]);
 
   87        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
   88        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
 
   89        const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
 
   91        // printf("On cell %i My local dofs: ", cell);
 
   92        // for(int i = 0; i < num_loc_dofs; ++i)
 
   94        //   printf("%i ", *(local_dofs + i));
 
   97        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
   99        // To minimize code repetition, we call the same kernel from cuda and non cuda build...
 
  100        VoxelAssembly::Kernel::poisson_assembly_kernel<SpaceHelp, LocMatType>(loc_mat, local_coeffs, cub_pt, cub_wg, num_cubs);
 
  103        LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
  108      /**************************************************************************************************************/
 
  109      /*                                       CUDA Host OMP Kernels                                                */
 
  110      /**************************************************************************************************************/
 
  111      /** \copydoc(poisson_assembler_matrix1_csr())*/
 
  112      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
  113      void poisson_assembler_matrix1_csr_host(DT_*  matrix_data,
 
  114                const IT_*  matrix_row_ptr, const IT_*  matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
  115                const Tiny::Vector<DT_, Space_::world_dim>*  cub_pt,
 
  116                const DT_*  cub_wg, Index num_cubs, DT_ alpha,
 
  117                const IT_*  cell_to_dof, [[maybe_unused]] Index cell_num,
 
  118                const Tiny::Vector<DT_, Space_::world_dim>*  nodes, [[maybe_unused]] Index node_size,
 
  119                const int*  coloring_map, Index coloring_size,
 
  120                const IT_* cell_to_dof_sorter)
 
  123        typedef Space_ SpaceType;
 
  124        typedef DT_ DataType;
 
  125        typedef IT_ IndexType;
 
  127        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
  129        constexpr int dim = SpaceHelp::dim;
 
  131        // define local sizes
 
  132        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  133        // get number of nodes per element
 
  134        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  135        // Our local matrix type
 
  136        typedef Tiny::Matrix<DataType, num_loc_dofs, num_loc_dofs> LocMatType;
 
  139        FEAT_PRAGMA_OMP(parallel for)
 
  140        for(Index idx = 0; idx < coloring_size; ++idx)
 
  142          // define local coefficients
 
  143          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
  145          Tiny::Matrix<DataType, num_loc_dofs, num_loc_dofs> loc_mat;
 
  146          // now do work for this cell
 
  147          int cell = coloring_map[idx];
 
  148          // std::cout << "Starting with cell " << cell << std::endl;
 
  149          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
  150          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
 
  151          const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
 
  153          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
  155          // To minimize code repetition, we call the same kernel from cuda and non cuda build...
 
  156          VoxelAssembly::Kernel::poisson_assembly_kernel<SpaceHelp, LocMatType>(loc_mat, local_coeffs, cub_pt, cub_wg, int(num_cubs));
 
  159          LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
  168      template<typename Space_, typename DT_, typename IT_>
 
  169      void assemble_poisson_csr([[maybe_unused]] const Space_& space,
 
  170              const CSRMatrixData<DT_, IT_>& matrix_data,
 
  171              const AssemblyCubatureData<DT_>& cubature,
 
  172              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  173              const std::vector<int*>& coloring_maps,
 
  174              const std::vector<Index>& coloring_map_sizes,
 
  177        const Index blocksize = Util::cuda_blocksize_spmv;
 
  178        // const Index blocksize = 64;
 
  180        for(Index i = 0; i < coloring_maps.size(); ++i)
 
  184          block.x = (unsigned int)blocksize;
 
  185          grid.x = (unsigned int)ceil(double(coloring_map_sizes[i])/double(block.x));
 
  187          //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  188          VoxelAssembly::Kernel::template poisson_assembler_matrix1_csr<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  189              matrix_data.data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  190              (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  191              cubature.cub_wg, cubature.num_cubs, alpha,
 
  192              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  193              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  194              (const int*) coloring_maps[i], coloring_map_sizes[i], dof_mapping.cell_to_dof_sorter
 
  198        //check for cuda error in our kernel
 
  199        Util::cuda_check_last_error();
 
  202      template<typename Space_, typename DT_, typename IT_>
 
  203      void assemble_poisson_csr_host([[maybe_unused]] const Space_& space,
 
  204              const CSRMatrixData<DT_, IT_>& matrix_data,
 
  205              const AssemblyCubatureData<DT_>& cubature,
 
  206              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  207              const std::vector<int*>& coloring_maps,
 
  208              const std::vector<Index>& coloring_map_sizes,
 
  211        for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
  213          VoxelAssembly::Kernel::template poisson_assembler_matrix1_csr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
 
  214            matrix_data.data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  215            (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  216            cubature.cub_wg, cubature.num_cubs, alpha, dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  217            (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  218            (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter
 
  227using namespace FEAT::VoxelAssembly;
 
  229/*--------------------Poisson Assembler Q2Quad-------------------------------------------------*/
 
  230template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  231                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  232template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  233                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  234template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  235                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  236template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  237                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  238// #ifdef FEAT_HAVE_HALFMATH
 
  239// template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
  240//                                           const std::vector<int*>&, const std::vector<Index>&, Half);
 
  241// template void Arch::assemble_poisson_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
  242//                                           const std::vector<int*>&, const std::vector<Index>&, Half);
 
  245template void Arch::assemble_poisson_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  246                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  247template void Arch::assemble_poisson_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  248                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  249template void Arch::assemble_poisson_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  250                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  251template void Arch::assemble_poisson_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  252                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  254/*---------------------Poisson Assembler Q2Hexa------------------------------------------------------*/
 
  255template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  256                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  257template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  258                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  259template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  260                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  261template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  262                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  263// #ifdef FEAT_HAVE_HALFMATH
 
  264// template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
  265//                                           const std::vector<int*>&, const std::vector<Index>&, Half);
 
  266// template void Arch::assemble_poisson_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
  267//                                           const std::vector<int*>&, const std::vector<Index>&, Half);
 
  270template void Arch::assemble_poisson_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  271                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  272template void Arch::assemble_poisson_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  273                                          const std::vector<int*>&, const std::vector<Index>&, float);
 
  274template void Arch::assemble_poisson_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  275                                          const std::vector<int*>&, const std::vector<Index>&, double);
 
  276template void Arch::assemble_poisson_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  277                                          const std::vector<int*>&, const std::vector<Index>&, float);