1// FEAT3: Finite Element Analysis Toolbox, Version 3
 
    2// Copyright (C) 2010 by Stefan Turek & the FEAT group
 
    3// FEAT3 is released under the GNU General Public License version 3,
 
    4// see the file 'copyright.txt' in the top level directory for details.
 
    6#include <kernel/base_header.hpp>
 
    7#include <kernel/util/cuda_util.hpp>
 
    8#include <kernel/voxel_assembly/defo_assembler.hpp>
 
    9#include <kernel/lafem/matrix_gather_scatter_helper.hpp>
 
   10#include <kernel/voxel_assembly/helper/space_helper.hpp>
 
   11#include <kernel/util/tiny_algebra.hpp>
 
   15  namespace VoxelAssembly
 
   19      /**************************************************************************************************************/
 
   21      /**************************************************************************************************************/
 
   23      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
   24      __global__ void defo_assembler_matrix1_bcsr(DT_* __restrict__ matrix_data,
 
   25                const IT_* __restrict__  matrix_row_ptr, const IT_* __restrict__ matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
   26                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ cub_pt,
 
   27                const DT_* __restrict__  cub_wg, int num_cubs, DT_ alpha,
 
   28                const IT_* __restrict__ cell_to_dof, [[maybe_unused]] Index cell_num,
 
   29                const Tiny::Vector<DT_, Space_::world_dim>* __restrict__ nodes, [[maybe_unused]] Index node_size,
 
   30                const int* __restrict__ coloring_map, Index coloring_size, const IT_* __restrict__ cell_to_dof_sorter, DT_ nu)
 
   32        Index idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   33        if(idx >= coloring_size)
 
   36        typedef Space_ SpaceType;
 
   38        typedef IT_ IndexType;
 
   40        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
   42        constexpr int dim = SpaceHelp::dim;
 
   45        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
   46        // get number of nodes per element
 
   47        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
   49        // typedef Tiny::Vector<DataType, dim> VecValueType;
 
   50        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
   51        // typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
   52        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
   54        // define local coefficients
 
   55        Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
   57        LocalMatrixType loc_mat;
 
   60        //now do work for this cell
 
   61        Index cell = Index(coloring_map[idx]);
 
   62        const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
   63        const IndexType* local_dofs = cell_to_dof + cell*num_loc_dofs;
 
   64        const IndexType* local_dof_sorter = cell_to_dof_sorter + cell*num_loc_dofs;
 
   65        SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
   67        // To minimize code repetition, we call the same kernel from cuda and non cuda build...
 
   68        VoxelAssembly::Kernel::defo_assembly_kernel<SpaceHelp, LocalMatrixType>(loc_mat, local_coeffs, cub_pt, cub_wg, num_cubs, nu);
 
   71        LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
   76      /**************************************************************************************************************/
 
   77      /*                                       CUDA Host OMP Kernels                                                */
 
   78      /**************************************************************************************************************/
 
   80      template<typename Space_, typename DT_, typename IT_, FEAT::Intern::MatrixGatherScatterPolicy pol_ = FEAT::Intern::MatrixGatherScatterPolicy::useLocalOps>
 
   81      void defo_assembler_matrix1_csr_host(DT_*  matrix_data,
 
   82                const IT_*  matrix_row_ptr, const IT_*  matrix_col_idx, Index matrix_num_rows, Index matrix_num_cols,
 
   83                const Tiny::Vector<DT_, Space_::world_dim>*  cub_pt,
 
   84                const DT_*  cub_wg, int num_cubs, DT_ alpha,
 
   85                const IT_*  cell_to_dof, [[maybe_unused]] Index cell_num,
 
   86                const Tiny::Vector<DT_, Space_::world_dim>*  nodes, [[maybe_unused]] Index node_size,
 
   87                const int*  coloring_map, Index coloring_size,
 
   88                const IT_* cell_to_dof_sorter, DT_ nu)
 
   91        typedef Space_ SpaceType;
 
   93        typedef IT_ IndexType;
 
   95        typedef VoxelAssembly::SpaceHelper<SpaceType, DT_, IT_> SpaceHelp;
 
   97        constexpr int dim = SpaceHelp::dim;
 
  100        constexpr int num_loc_dofs = SpaceType::DofMappingType::dof_count;
 
  101        // get number of nodes per element
 
  102        constexpr int num_loc_verts = SpaceType::MeshType::template IndexSet<dim, 0>::Type::num_indices;
 
  103        //our local datatypes
 
  104        // typedef Tiny::Vector<DataType, dim> VecValueType;
 
  105        typedef Tiny::Matrix<DataType, dim, dim> MatValueType;
 
  106        // typedef Tiny::Vector<VecValueType, num_loc_dofs> LocalVectorType;
 
  107        typedef Tiny::Matrix<MatValueType, num_loc_dofs, num_loc_dofs> LocalMatrixType;
 
  111        FEAT_PRAGMA_OMP(parallel for)
 
  112        for(Index idx = 0; idx < coloring_size; ++idx)
 
  114          // define local coefficients
 
  115          Tiny::Matrix<DataType, dim, num_loc_verts> local_coeffs;
 
  117          LocalMatrixType loc_mat;
 
  118          // now do work for this cell
 
  119          int cell = coloring_map[idx];
 
  120          // std::cout << "Starting with cell " << cell << std::endl;
 
  121          const IndexSetWrapper<IndexType> local_dofs_w{cell_to_dof, IndexType(num_loc_dofs)};
 
  122          const IndexType* local_dofs = cell_to_dof + IndexType(cell)*num_loc_dofs;
 
  123          const IndexType* local_dof_sorter = cell_to_dof_sorter + IndexType(cell)*num_loc_dofs;
 
  125          SpaceHelp::set_coefficients(local_coeffs, local_dofs_w, nodes, cell);
 
  127          // To minimize code repetition, we call the same kernel from cuda and non cuda build...
 
  128          VoxelAssembly::Kernel::defo_assembly_kernel<SpaceHelp, LocalMatrixType>(loc_mat, local_coeffs, cub_pt, cub_wg, num_cubs, nu);
 
  131          LAFEM::template MatrixGatherScatterHelper<SpaceType, DataType, IndexType, pol_>::scatter_matrix_csr(loc_mat, (MatValueType*)matrix_data, local_dofs, local_dofs, IndexType(matrix_num_rows), IndexType(matrix_num_cols), matrix_row_ptr, matrix_col_idx, alpha, local_dof_sorter);
 
  137    /**************************************************************************************************************/
 
  138    /*                                       CUDA kernel wrapper                                                  */
 
  139    /**************************************************************************************************************/
 
  143      template<typename Space_, typename DT_, typename IT_>
 
  144      void assemble_defo_csr([[maybe_unused]] const Space_& space,
 
  145              const CSRMatrixData<DT_, IT_>& matrix_data,
 
  146              const AssemblyCubatureData<DT_>& cubature,
 
  147              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  148              const std::vector<int*>& coloring_maps,
 
  149              const std::vector<Index>& coloring_map_sizes,
 
  153        const Index blocksize = Util::cuda_blocksize_blocked_assembly;
 
  154        // const Index blocksize = 64;
 
  156        for(Index i = 0; i < coloring_maps.size(); ++i)
 
  160          block.x = (unsigned int)blocksize;
 
  161          grid.x = (unsigned int)ceil(double(coloring_map_sizes[i])/double(block.x));
 
  163          //kernel call, since this uses the standard stream, sync before next call is enforced:
 
  164          VoxelAssembly::Kernel::template defo_assembler_matrix1_bcsr<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper><<< grid, block >>>(
 
  165              matrix_data.data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  166              (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  167              cubature.cub_wg, cubature.num_cubs, alpha,
 
  168              dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  169              (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  170              (const int*) coloring_maps[i], coloring_map_sizes[i], dof_mapping.cell_to_dof_sorter, nu
 
  174        //check for cuda error in our kernel
 
  175        Util::cuda_check_last_error();
 
  178      template<typename Space_, typename DT_, typename IT_>
 
  179      void assemble_defo_csr_host([[maybe_unused]] const Space_& space,
 
  180              const CSRMatrixData<DT_, IT_>& matrix_data,
 
  181              const AssemblyCubatureData<DT_>& cubature,
 
  182              const AssemblyMappingData<DT_, IT_>& dof_mapping,
 
  183              const std::vector<int*>& coloring_maps,
 
  184              const std::vector<Index>& coloring_map_sizes,
 
  188        for(Index col = 0; col < Index(coloring_maps.size()); ++col)
 
  190          VoxelAssembly::Kernel::template defo_assembler_matrix1_csr_host<Space_, DT_, IT_, FEAT::Intern::MatrixGatherScatterPolicy::useLocalSortHelper>(
 
  191            matrix_data.data, matrix_data.row_ptr, matrix_data.col_idx, matrix_data.num_rows, matrix_data.num_cols,
 
  192            (const typename Tiny::Vector<DT_, Space_::world_dim>*) cubature.cub_pt,
 
  193            cubature.cub_wg, cubature.num_cubs, alpha, dof_mapping.cell_to_dof, dof_mapping.cell_num,
 
  194            (const typename Tiny::Vector<DT_, Space_::world_dim>*) dof_mapping.nodes, dof_mapping.node_size,
 
  195            (const int*) coloring_maps[col], coloring_map_sizes[col], dof_mapping.cell_to_dof_sorter, nu
 
  205using namespace FEAT::VoxelAssembly;
 
  207template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  208                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  209template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  210                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  211template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  212                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  213template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  214                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  215// #ifdef FEAT_HAVE_HALFMATH
 
  216// template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint32_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
  217//                                           const std::vector<int*>&, const std::vector<Index>&, Half, Half);
 
  218// template void Arch::assemble_defo_csr(const Q2StandardQuad&, const CSRMatrixData<Half, std::uint64_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
  219//                                           const std::vector<int*>&, const std::vector<Index>&, Half, Half);
 
  222template void Arch::assemble_defo_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  223                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  224template void Arch::assemble_defo_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  225                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  226template void Arch::assemble_defo_csr_host(const Q2StandardQuad&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  227                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  228template void Arch::assemble_defo_csr_host(const Q2StandardQuad&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  229                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  232template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  233                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  234template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  235                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  236template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  237                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  238template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  239                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  240// #ifdef FEAT_HAVE_HALFMATH
 
  241// template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint32_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint32_t>&,
 
  242//                                           const std::vector<int*>&, const std::vector<Index>&, Half, Half);
 
  243// template void Arch::assemble_defo_csr(const Q2StandardHexa&, const CSRMatrixData<Half, std::uint64_t>&, const AssemblyCubatureData<Half>&, const AssemblyMappingData<Half, std::uint64_t>&,
 
  244//                                           const std::vector<int*>&, const std::vector<Index>&, Half, Half);
 
  247template void Arch::assemble_defo_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint32_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint32_t>&,
 
  248                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  249template void Arch::assemble_defo_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint32_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint32_t>&,
 
  250                                          const std::vector<int*>&, const std::vector<Index>&, float, float);
 
  251template void Arch::assemble_defo_csr_host(const Q2StandardHexa&, const CSRMatrixData<double, std::uint64_t>&, const AssemblyCubatureData<double>&, const AssemblyMappingData<double, std::uint64_t>&,
 
  252                                          const std::vector<int*>&, const std::vector<Index>&, double, double);
 
  253template void Arch::assemble_defo_csr_host(const Q2StandardHexa&, const CSRMatrixData<float, std::uint64_t>&, const AssemblyCubatureData<float>&, const AssemblyMappingData<float, std::uint64_t>&,
 
  254                                          const std::vector<int*>&, const std::vector<Index>&, float, float);