9#include <kernel/util/tiny_algebra.hpp>
12#include <cuda/std/type_traits>
36 template<
typename Space_,
typename DT_,
typename IT_>
37 struct VectorGatherScatterHelper;
39 template<
typename Space_,
typename DT_,
typename IT_>
42 typedef Space_ SpaceType;
44 typedef IT_ IndexType;
61 template<
typename InnerType_,
int numr_>
65 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
67 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
70 for (
int i(0); i < numr_; ++i)
73 Index dof_idx = map[i];
81 #if defined(__CUDACC__) || defined(DOXYGEN)
100 template<
typename ThreadGroup_,
int inner_size_>
101 CUDA_DEVICE
static void __forceinline__
grouped_scatter_vector_dense(
const ThreadGroup_& tg,
const int scatter_size,
const int scatter_offset,
const DataType* loc_vec,
102 DataType* data, [[maybe_unused]] IndexType num_entries,
const IndexType* map,
103 int data_size, DataType alpha = DataType(1))
106 for(
int idx = tg.thread_rank(); (idx < scatter_size * inner_size_) && ((idx + scatter_offset*inner_size_) < data_size*inner_size_); idx += tg.num_threads())
109 Index dof_idx = map[idx/inner_size_];
113 data[dof_idx*inner_size_+(idx%inner_size_)] += alpha * loc_vec[idx];
133 template<
typename InnerType_,
int numr_>
137 static_assert(std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
139 static_assert(::cuda::std::is_same<typename Tiny::Intern::DataTypeExtractor<InnerType_>::MyDataType, DataType>(),
"Inner Datatype does not match!");
142 for (
int i(0); i < numr_; ++i)
145 Index dof_idx = map[i];
153 #if defined(__CUDACC__) || defined(DOXYGEN)
171 template<
typename ThreadGroup_,
int inner_size_>
173 const DataType* data, [[maybe_unused]] IndexType num_entries,
const IndexType* map,
174 int num_data, DataType alpha = DataType(1))
177 for(
int i = tg.thread_rank(); i < num_data*inner_size_; i += tg.num_threads())
180 Index dof_idx = map[i/inner_size_];
184 loc_vec[i] += alpha * data[dof_idx*inner_size_+(i%inner_size_)];
Tiny Vector class template.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
std::uint64_t Index
Index data type.
Standalone Vector Gather and Scatter Axpy Interface.
static CUDA_DEVICE void __forceinline__ grouped_gather_vector_dense(const ThreadGroup_ &tg, DataType *loc_vec, const DataType *data, IndexType num_entries, const IndexType *map, int num_data, DataType alpha=DataType(1))
Dense Vector grouped gather axpy function.
static CUDA_HOST_DEVICE void gather_vector_dense(Tiny::Vector< InnerType_, numr_ > &loc_vec, const InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector gather axpy function.
static CUDA_DEVICE void __forceinline__ grouped_scatter_vector_dense(const ThreadGroup_ &tg, const int scatter_size, const int scatter_offset, const DataType *loc_vec, DataType *data, IndexType num_entries, const IndexType *map, int data_size, DataType alpha=DataType(1))
Dense Vector grouped scatter axpy function async version.
static CUDA_HOST_DEVICE void scatter_vector_dense(const Tiny::Vector< InnerType_, numr_ > &loc_vec, InnerType_ *data, IndexType num_entries, const IndexType *map, DataType alpha=DataType(1))
Dense Vector scatter axpy function.