FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
tiny_algebra.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
8#include <iostream>
9
10#include <ios>
13#include <limits>
14#ifndef __CUDA_ARCH__
15#include <kernel/util/math.hpp>
16#endif
17
18// includes, system
19#ifndef __CUDACC__
20#include <initializer_list>
21#else
22#include <kernel/util/cuda_math.cuh>
23#endif
24
25namespace FEAT
26{
33 namespace Tiny
34 {
52 template<
53 typename T_,
54 int n_,
55 int s_ = n_>
56 class Vector DOXY({});
57
78 template<
79 typename T_,
80 int m_,
81 int n_,
82 int sm_ = m_,
83 int sn_ = n_>
84 class Matrix DOXY({});
85
112 template<
113 typename T_,
114 int l_,
115 int m_,
116 int n_,
117 int sl_ = l_,
118 int sm_ = m_,
119 int sn_ = n_>
120 class Tensor3 DOXY({});
121
123 namespace Intern
124 {
133 template<typename T_>
134 struct DataTypeExtractor
135 {
137 typedef T_ MyDataType;
139 static constexpr int level = 0;
140 };
141
156 template<typename T_, int n_, int s_>
157 struct DataTypeExtractor<Vector<T_, n_, s_>>
158 {
160 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
162 static constexpr int level = DataTypeExtractor<T_>::level+1;
163 };
164
165 // Same for Matrix
166 template<typename T_, int m_, int n_, int sm_, int sn_>
167 struct DataTypeExtractor<Matrix<T_, m_, n_, sm_, sn_>>
168 {
170 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
172 static constexpr int level = DataTypeExtractor<T_>::level+1;
173 };
174
175 // Same for Tensor3
176 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
177 struct DataTypeExtractor<Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>>
178 {
180 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
182 static constexpr int level = DataTypeExtractor<T_>::level+1;
183 };
184
185 // forward declarations of helper classes
186 template<int m_, int n_>
187 struct DetHelper;
188
189 template<int m_, int n_>
190 struct VolHelper;
191
192 template<int m_, int n_>
193 struct InverseHelper;
194
195 template<int m_, int n_>
196 struct CofactorHelper;
197
198#ifdef __CUDACC__
199 template<int m_, int n_>
200 struct CudaGroupedInverseHelper;
201#endif
202 } // namespace Intern
204
205 /* ************************************************************************************************************* */
206 /* ************************************************************************************************************* */
207 // Tiny Vector implementation
208 /* ************************************************************************************************************* */
209 /* ************************************************************************************************************* */
210
211 template<typename T_, int n_, int s_>
212 class Vector
213 {
214 static_assert(n_ > 0, "invalid vector length");
215 static_assert(s_ >= n_, "invalid vector stride");
216
217 public:
219 static constexpr int n = n_;
221 static constexpr int s = s_;
222
224 typedef T_ ValueType;
226 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
227
229 T_ v[s_];
230
232 CUDA_HOST_DEVICE Vector()
233 {
234 }
235
237 CUDA_HOST_DEVICE explicit Vector(DataType value)
238 {
239 for(int i(0); i < n_; ++i)
240 {
241 v[i] = value;
242 }
243 }
244
246 template<int sx_>
247 CUDA_HOST_DEVICE Vector(const Vector<T_, n_, sx_>& x)
248 {
249 for(int i(0); i < n_; ++i)
250 {
251 v[i] = x.v[i];
252 }
253 }
254
256 template<typename Tx_, int sx_>
257 CUDA_HOST_DEVICE explicit Vector(const Vector<Tx_, n_, sx_>& x)
258 {
259 for(int i(0); i < n_; ++i)
260 {
261 v[i] = ValueType(x.v[i]);
262 }
263 }
264
266 template<typename Tx_>
267 CUDA_HOST_DEVICE explicit Vector(const std::array<Tx_, n_>& x)
268 {
269 for(int i(0); i < n_; ++i)
270 {
271 v[i] = ValueType(x[i]);
272 }
273 }
274
276 template<typename Tx_, int sx_>
277 CUDA_HOST_DEVICE static Vector convert_new(const Vector<Tx_, n_, sx_>& x)
278 {
279 Vector v;
280 for(int i(0); i < n_; ++i)
281 {
282 if constexpr(std::is_same<T_, DataType>::value)
283 v[i] = T_(x.v[i]);
284 else
285 v[i] = T_::convert(x.v[i]);
286 }
287 return v;
288 }
289
291 CUDA_HOST_DEVICE static Vector convert_new(Vector&& x)
292 {
293 return std::move(x);
294 }
295
308 template<typename Tx_>
309 CUDA_HOST_DEVICE Vector(const std::initializer_list<Tx_>& x)
310 {
311 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
312 auto it(x.begin());
313 for(int i(0); i < n_; ++i, ++it)
314 v[i] = T_(*it);
315 }
316
318 CUDA_HOST_DEVICE Vector& operator=(DataType value)
319 {
320 for(int i(0); i < n_; ++i)
321 {
322 v[i] = value;
323 }
324 return *this;
325 }
326
328 template<int sx_>
329 CUDA_HOST_DEVICE Vector& operator=(const Vector<T_, n_, sx_>& x)
330 {
331 for(int i(0); i < n_; ++i)
332 {
333 v[i] = x.v[i];
334 }
335 return *this;
336 }
337
339 template<typename Tx_, int sx_>
340 CUDA_HOST_DEVICE Vector& operator=(const Vector<Tx_, n_, sx_>& x)
341 {
342 for(int i(0); i < n_; ++i)
343 {
344 v[i] = ValueType(x.v[i]);
345 }
346 return *this;
347 }
348
350 template<typename Tx_>
351 CUDA_HOST_DEVICE Vector& operator=(const std::array<Tx_, n_>& x)
352 {
353 for(int i(0); i < n_; ++i)
354 {
355 v[i] = ValueType(x[std::size_t(i)]);
356 }
357 return *this;
358 }
359
374 template<typename Tx_>
375 CUDA_HOST_DEVICE Vector& operator=(const std::initializer_list<Tx_>& x)
376 {
377 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
378 auto it(x.begin());
379 for(int i(0); i < n_; ++i, ++it)
380 v[i] = T_(*it);
381 return *this;
382 }
383
385 template<typename Tx_, int sx_>
386 CUDA_HOST_DEVICE void convert(const Vector<Tx_, n_, sx_>& x)
387 {
388 for(int i(0); i < n_; ++i)
389 v[i] = T_(x.v[i]);
390 }
391
400 CUDA_HOST_DEVICE T_& operator()(int i)
401 {
402 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
403 return v[i];
404 }
405
407 CUDA_HOST_DEVICE const T_& operator()(int i) const
408 {
409 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
410 return v[i];
411 }
412
414 CUDA_HOST_DEVICE T_& operator[](int i)
415 {
416 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
417 return v[i];
418 }
419
421 CUDA_HOST_DEVICE const T_& operator[](int i) const
422 {
423 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
424 return v[i];
425 }
426
433 template<int snx_>
434 CUDA_HOST_DEVICE void copy(const Vector<T_, n_, snx_>& x)
435 {
436 for(int i(0); i < n_; ++i)
437 {
438 v[i] = x.v[i];
439 }
440 }
441
451 template<int nn_, int nx_, int snx_>
452 CUDA_HOST_DEVICE void copy_n(const Vector<T_, nx_, snx_>& x)
453 {
454 static_assert(nn_ <= n_, "invalid copy_n size");
455 static_assert(nn_ <= nx_, "invalid copy_n size");
456 for(int i(0); i < nn_; ++i)
457 {
458 v[i] = x.v[i];
459 }
460 }
461
463 CUDA_HOST_DEVICE Vector& operator*=(DataType alpha)
464 {
465 for(int i(0); i < n_; ++i)
466 {
467 v[i] *= alpha;
468 }
469 return *this;
470 }
471
473 template <int sx_>
474 CUDA_HOST_DEVICE Vector& operator*=(const Vector<T_, n_, sx_>& x)
475 {
476 for(int i(0); i < n_; ++i)
477 {
478 v[i] *= x.v[i];
479 }
480 return *this;
481 }
482
484 template<int sx_>
485 CUDA_HOST_DEVICE Vector& operator+=(const Vector<T_, n_, sx_>& x)
486 {
487 for(int i(0); i < n_; ++i)
488 {
489 v[i] += x.v[i];
490 }
491 return *this;
492 }
493
495 template<int sx_>
496 CUDA_HOST_DEVICE Vector& operator-=(const Vector<T_, n_, sx_>& x)
497 {
498 for(int i(0); i < n_; ++i)
499 {
500 v[i] -= x.v[i];
501 }
502 return *this;
503 }
504
511 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
512 {
513 for(int i(0); i < n_; ++i)
514 {
515 v[i] = alpha;
516 }
517 }
518
528 template<int nn_>
529 CUDA_HOST_DEVICE void format_n(DataType alpha = DataType(0))
530 {
531 static_assert(nn_ <= n_, "invalid format_n size");
532 for(int i(0); i < nn_; ++i)
533 {
534 v[i] = alpha;
535 }
536 }
537
544 CUDA_HOST_DEVICE void scale(DataType alpha)
545 {
546 for(int i(0); i < n_; ++i)
547 {
548 v[i] *= alpha;
549 }
550 }
551
561 template<int nn_>
562 CUDA_HOST_DEVICE void scale_n(DataType alpha)
563 {
564 static_assert(nn_ <= n_, "invalid scale_n size");
565 for(int i(0); i < nn_; ++i)
566 {
567 v[i] *= alpha;
568 }
569 }
570
576 CUDA_HOST_DEVICE Vector& normalize()
577 {
578 #ifndef __CUDACC__
579 const DataType norm2(this->norm_euclid());
580 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
581 return ((*this) *= (DataType(1)/norm2));
582 #else
583 const DataType norm2_sqr(this->norm_euclid_sqr());
584 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
585 return ((*this) *= CudaMath::cuda_rsqrt(norm2_sqr));
586 #endif
587 }
588
597 template<int nn_>
598 CUDA_HOST_DEVICE Vector& normalize_n()
599 {
600 static_assert(nn_ <= n_, "invalid normalize_n size");
601#ifndef __CUDACC__
602 const DataType norm2(this->template norm_euclid_n<nn_>());
603 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
604 this->template scale_n<nn_>(DataType(1)/norm2);
605 return *this;
606#else
607 const DataType norm2_sqr(this->template norm_euclid_sqr_n<nn_>());
608 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
609 this->template scale_n<nn_>(CudaMath::cuda_rsqrt(norm2_sqr));
610 return *this;
611#endif
612 }
613
619 CUDA_HOST_DEVICE Vector& negate()
620 {
621 for(int i(0); i < n_; ++i)
622 v[i] = -v[i];
623 return *this;
624 }
625
634 template<int nn_>
635 CUDA_HOST_DEVICE Vector& negate_n()
636 {
637 static_assert(nn_ <= n_, "invalid negate_n size");
638 for(int i(0); i < nn_; ++i)
639 v[i] = -v[i];
640 return *this;
641 }
642
654 template<int snx_>
655 CUDA_HOST_DEVICE Vector& axpy(DataType alpha, const Vector<T_, n_, snx_>& x)
656 {
657 for(int i(0); i < n_; ++i)
658 v[i] += alpha * x.v[i];
659 return *this;
660 }
661
676 template<int nn_, int nx_, int snx_>
677 CUDA_HOST_DEVICE Vector& axpy_n(DataType alpha, const Vector<T_, nx_, snx_>& x)
678 {
679 static_assert(nn_ <= n_, "invalid negate_n size");
680 static_assert(nn_ <= nx_, "invalid negate_n size");
681 for(int i(0); i < nn_; ++i)
682 v[i] += alpha * x.v[i];
683 return *this;
684 }
685
703 template<int sna_, int snb_>
704 CUDA_HOST_DEVICE Vector& set_convex(DataType alpha, const Vector<T_, n_, sna_>& a, const Vector<T_, n_, snb_>& b)
705 {
706 for(int i(0); i < n_; ++i)
707 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
708 return *this;
709 }
710
731 template<int nn_, int na_, int nb_, int sna_, int snb_>
732 CUDA_HOST_DEVICE Vector& set_convex_n(DataType alpha, const Vector<T_, na_, sna_>& a, const Vector<T_, nb_, snb_>& b)
733 {
734 static_assert(nn_ <= n_, "invalid set_convex_n size");
735 static_assert(nn_ <= na_, "invalid set_convex_n size");
736 static_assert(nn_ <= nb_, "invalid set_convex_n size");
737 for(int i(0); i < nn_; ++i)
738 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
739 return *this;
740 }
741
757 template<int m_, int sma_, int sna_, int sx_>
759 {
760 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
761 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
762
763 for(int i(0); i < n_; ++i)
764 {
765 v[i] = T_(0);
766 for(int j(0); j < m_; ++j)
767 {
768 v[i] += a.v[i][j] * x.v[j];
769 }
770 }
771 return *this;
772 }
773
793 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
795 {
796 static_assert(mm_ <= n_, "invalid set_mat_vec_mult_n size");
797 static_assert(mm_ <= ma_, "invalid set_mat_vec_mult_n size");
798 static_assert(nn_ <= nx_, "invalid set_mat_vec_mult_n size");
799 static_assert(nn_ <= na_, "invalid set_mat_vec_mult_n size");
800
801 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
802 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
803
804 for(int i(0); i < mm_; ++i)
805 {
806 v[i] = T_(0);
807 for(int j(0); j < nn_; ++j)
808 {
809 v[i] += a.v[i][j] * x.v[j];
810 }
811 }
812 return *this;
813 }
814
830 template<int m_, int sma_, int sna_, int sx_>
832 {
833 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
834 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
835
836 for(int j(0); j < n_; ++j)
837 {
838 v[j] = T_(0);
839 for(int i(0); i < m_; ++i)
840 {
841 v[j] += a.v[i][j] * x.v[i];
842 }
843 }
844 return *this;
845 }
846
866 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
868 {
869 static_assert(mm_ <= mx_, "invalid set_mat_vec_mult_n size");
870 static_assert(mm_ <= ma_, "invalid set_mat_vec_mult_n size");
871 static_assert(nn_ <= n_, "invalid set_mat_vec_mult_n size");
872 static_assert(nn_ <= na_, "invalid set_mat_vec_mult_n size");
873
874 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
875 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
876
877 for(int j(0); j < nn_; ++j)
878 {
879 v[j] = T_(0);
880 for(int i(0); i < mm_; ++i)
881 {
882 v[j] += a.v[i][j] * x.v[i];
883 }
884 }
885 return *this;
886 }
887
906 template<int m_, int sma_, int sna_, int sx_>
908 {
909 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
910 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
911
912 for(int i(0); i < n_; ++i)
913 {
914 for(int j(0); j < m_; ++j)
915 {
916 v[i] += alpha * a.v[i][j] * x.v[j];
917 }
918 }
919 return *this;
920 }
921
946 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
948 {
949 static_assert(mm_ <= n_, "invalid add_mat_vec_mult_n size");
950 static_assert(mm_ <= ma_, "invalid add_mat_vec_mult_n size");
951 static_assert(nn_ <= nx_, "invalid add_mat_vec_mult_n size");
952 static_assert(nn_ <= na_, "invalid add_mat_vec_mult_n size");
953
954 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
955 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
956
957 for(int i(0); i < mm_; ++i)
958 {
959 for(int j(0); j < nn_; ++j)
960 {
961 v[i] += alpha * a.v[i][j] * x.v[j];
962 }
963 }
964 return *this;
965 }
966
985 template<int m_, int sma_, int sna_, int sx_>
987 {
988 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
989 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
990
991 for(int j(0); j < n_; ++j)
992 {
993 for(int i(0); i < m_; ++i)
994 {
995 v[j] += alpha * a.v[i][j] * x.v[i];
996 }
997 }
998 return *this;
999 }
1000
1025 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
1027 {
1028 static_assert(mm_ <= mx_, "invalid add_vec_mat_mult_n size");
1029 static_assert(mm_ <= ma_, "invalid add_vec_mat_mult_n size");
1030 static_assert(nn_ <= n_, "invalid add_vec_mat_mult_n size");
1031 static_assert(nn_ <= na_, "invalid add_vec_mat_mult_n size");
1032
1033 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1034 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
1035
1036 for(int j(0); j < nn_; ++j)
1037 {
1038 for(int i(0); i < mm_; ++i)
1039 {
1040 v[j] += alpha * a.v[i][j] * x.v[i];
1041 }
1042 }
1043 return *this;
1044 }
1045
1052 CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
1053 {
1054 DataType r(DataType(0));
1055 for(int i(0); i < n_; ++i)
1056 {
1057 #ifndef __CUDACC__
1058 r += Math::sqr(v[i]);
1059 #else
1060 r += CudaMath::cuda_sqr(v[i]);
1061 #endif
1062 }
1063 return r;
1064 }
1065
1072 template<int nn_>
1073 CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
1074 {
1075 static_assert(nn_ <= n_, "invalid norm_euclid_sqr_n size");
1076 DataType r(DataType(0));
1077 for(int i(0); i < nn_; ++i)
1078 {
1079 #ifndef __CUDACC__
1080 r += Math::sqr(v[i]);
1081 #else
1082 r += CudaMath::cuda_sqr(v[i]);
1083 #endif
1084 }
1085 return r;
1086 }
1087
1094 CUDA_HOST_DEVICE DataType norm_euclid() const
1095 {
1096 #ifndef __CUDACC__
1097 return Math::sqrt(norm_euclid_sqr());
1098 #else
1099 return CudaMath::cuda_sqrt(norm_euclid_sqr());
1100 #endif
1101 }
1102
1109 template<int nn_>
1110 CUDA_HOST_DEVICE DataType norm_euclid_n() const
1111 {
1112 static_assert(nn_ <= n_, "invalid norm_euclid_n size");
1113 #ifndef __CUDACC__
1114 return Math::sqrt(this->template norm_euclid_sqr_n<nn_>());
1115 #else
1116 return CudaMath::cuda_sqrt(this->template norm_euclid_sqr_n<nn_>());
1117 #endif
1118 }
1119
1126 CUDA_HOST_DEVICE DataType norm_l1() const
1127 {
1128 DataType r(DataType(0));
1129 for(int i(0); i < n_; ++i)
1130 {
1131 #ifndef __CUDACC__
1132 r += Math::abs(v[i]);
1133 #else
1134 r += CudaMath::cuda_abs(v[i]);
1135 #endif
1136 }
1137 return r;
1138 }
1139
1146 template<int nn_>
1147 CUDA_HOST_DEVICE DataType norm_l1_n() const
1148 {
1149 static_assert(nn_ <= n_, "invalid norm_l1_n size");
1150 DataType r(DataType(0));
1151 for(int i(0); i < nn_; ++i)
1152 {
1153 #ifndef __CUDACC__
1154 r += Math::abs(v[i]);
1155 #else
1156 r += CudaMath::cuda_abs(v[i]);
1157 #endif
1158 }
1159 return r;
1160 }
1161
1168 CUDA_HOST_DEVICE DataType norm_max() const
1169 {
1170 DataType r(DataType(0));
1171 for(int i(0); i < n_; ++i)
1172 {
1173 #ifndef __CUDACC__
1174 r = Math::max(r, Math::abs(v[i]));
1175 #else
1176 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1177 #endif
1178 }
1179 return r;
1180 }
1181
1188 template<int nn_>
1189 CUDA_HOST_DEVICE DataType norm_max_n() const
1190 {
1191 static_assert(nn_ <= n_, "invalid norm_l1_n size");
1192 DataType r(DataType(0));
1193 for(int i(0); i < nn_; ++i)
1194 {
1195 #ifndef __CUDACC__
1196 r = Math::max(r, Math::abs(v[i]));
1197 #else
1198 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1199 #endif
1200 }
1201 return r;
1202 }
1203
1207 CUDA_HOST_DEVICE static Vector null()
1208 {
1209 return Vector(DataType(0));
1210 }
1211
1215 CUDA_HOST_DEVICE bool normalized() const
1216 {
1218 }
1219
1226 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Vector & b)
1227 {
1228 lhs << "[";
1229 for (int i(0) ; i < b.n ; ++i)
1230 {
1231 lhs << " " << stringify(b(i));
1232 }
1233 lhs << " ]";
1234
1235 return lhs;
1236 }
1237
1238 CUDA_HOST friend std::istream& operator>>(std::istream& in, Vector& vector)
1239 {
1240 // Ignore all input until opening bracket
1241 in.ignore(std::numeric_limits<std::streamsize>::max(), '[');
1242
1243 for(int i(0); i < Vector::n; ++i)
1244 {
1245 in >> vector(i);
1246 }
1247
1248 // Ignore all input until closing bracket is consumed
1249 in.ignore(std::numeric_limits<std::streamsize>::max(), ']');
1250
1251 return in;
1252 }
1253 }; // class Vector
1254
1255 template<typename T_, int sx_, int sa_>
1256 inline void cross(Vector<T_, 2, sx_>& x, const Vector<T_, 2, sa_>& a)
1257 {
1258 x.v[0] = a.v[1];
1259 x.v[1] = -a.v[0];
1260 }
1261
1262 template<typename T_, int sx_, int sa_, int sb_>
1263 inline void cross(Vector<T_, 3, sx_>& x, const Vector<T_, 3, sa_>& a, const Vector<T_, 3, sb_>& b)
1264 {
1265 x.v[0] = a.v[1]*b.v[2] - a.v[2]*b.v[1];
1266 x.v[1] = a.v[2]*b.v[0] - a.v[0]*b.v[2];
1267 x.v[2] = a.v[0]*b.v[1] - a.v[1]*b.v[0];
1268 }
1269
1271 template<typename T_, int n_, int s_>
1272 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(typename Vector<T_, n_>::DataType alpha, const Vector<T_, n_, s_>& x)
1273 {
1274 return Vector<T_, n_>(x) *= alpha;
1275 }
1276
1278 template<typename T_, int n_, int s_>
1279 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(const Vector<T_, n_, s_>& x, typename Vector<T_, n_>::DataType alpha)
1280 {
1281 return Vector<T_, n_>(x) *= alpha;
1282 }
1283
1285 template<typename T_, int n_, int sa_, int sb_>
1287 {
1288 return Vector<T_, n_>(a) *= b;
1289 }
1290
1292 template<typename T_, int n_, int sa_, int sb_>
1293 CUDA_HOST_DEVICE inline Vector<T_, n_> operator+(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1294 {
1295 return Vector<T_, n_>(a) += b;
1296 }
1297
1299 template<typename T_, int n_, int sa_, int sb_>
1300 CUDA_HOST_DEVICE inline Vector<T_, n_> operator-(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1301 {
1302 return Vector<T_, n_>(a) -= b;
1303 }
1304
1311 template<typename T_>
1312 CUDA_HOST inline T_ calculate_opening_angle(const Vector<T_,2>& x, const Vector<T_, 2>& y)
1313 {
1314 #ifdef __CUDACC__
1315 XABORTM("calculate_opening_angle not implemented for CUDA");
1316 return T_(0);
1317 #else
1318 return Math::calc_opening_angle(x[0], x[1], y[0], y[1]);
1319 #endif
1320 }
1321
1330 template<typename T_, int dim_>
1332 {
1333 T_ norm2(y.template norm_euclid_n<dim_>());
1334 if(norm2 < Math::eps<T_>())
1335 norm2 = T_(1);
1336 const auto tmp_normalized = (T_(1)/norm2) * y;
1337 return dot(x, tmp_normalized) * tmp_normalized;
1338 }
1339
1340
1341 /* ************************************************************************************************************* */
1342 /* ************************************************************************************************************* */
1343 // Tiny Matrix implementation
1344 /* ************************************************************************************************************* */
1345 /* ************************************************************************************************************* */
1346
1347 template<typename T_, int m_, int n_, int sm_, int sn_>
1348 class Matrix
1349 {
1350 static_assert(m_ > 0, "invalid row count");
1351 static_assert(n_ > 0, "invalid column count");
1352 static_assert(sm_ >= m_, "invalid row stride");
1353 static_assert(sn_ >= n_, "invalid column stride");
1354
1355 public:
1357 static constexpr int m = m_;
1359 static constexpr int n = n_;
1361 static constexpr int sm = sm_;
1363 static constexpr int sn = sn_;
1364
1366 typedef T_ ValueType;
1368 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
1369
1373 RowType v[sm_];
1374
1376 CUDA_HOST_DEVICE Matrix()
1377 {
1378 }
1379
1381 CUDA_HOST_DEVICE explicit Matrix(DataType value)
1382 {
1383 for(int i(0); i < m_; ++i)
1384 {
1385 v[i] = value;
1386 }
1387 }
1388
1390 template<typename T2_, int sma_, int sna_>
1391 CUDA_HOST_DEVICE Matrix(const Matrix<T2_, m_, n_, sma_, sna_>& a)
1392 {
1393 for(int i(0); i < m_; ++i)
1394 {
1395 for(int j(0); j < n_; ++j)
1396 {
1397 v[i][j] = T_(a.v[i][j]);
1398 }
1399 }
1400 }
1401
1413 template<typename Tx_>
1414 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<Tx_>& x)
1415 {
1416 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1417 auto it(x.begin());
1418 for(int i(0); i < m_; ++i, ++it)
1419 v[i] = *it;
1420 }
1421
1435 template<typename Tx_>
1436 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<std::initializer_list<Tx_>>& x)
1437 {
1438 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1439 auto it(x.begin());
1440 for(int i(0); i < m_; ++i, ++it)
1441 v[i] = *it;
1442 }
1443
1445 CUDA_HOST_DEVICE Matrix& operator=(DataType value)
1446 {
1447 for(int i(0); i < m_; ++i)
1448 {
1449 v[i] = value;
1450 }
1451 return *this;
1452 }
1453
1455 template<int sma_, int sna_>
1457 {
1458 for(int i(0); i < m_; ++i)
1459 {
1460 v[i] = a.v[i];
1461 }
1462 return *this;
1463 }
1464
1477 template<typename Tx_>
1478 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<Tx_>& x)
1479 {
1480 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1481 auto it(x.begin());
1482 for(int i(0); i < m_; ++i, ++it)
1483 v[i] = *it;
1484 return *this;
1485 }
1486
1498 template<typename Tx_>
1499 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<std::initializer_list<Tx_>>& x)
1500 {
1501 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1502 auto it(x.begin());
1503 for(int i(0); i < m_; ++i, ++it)
1504 v[i] = *it;
1505 return *this;
1506 }
1507
1509 template<typename Tx_, int sma_, int sna_>
1510 CUDA_HOST_DEVICE void convert(const Matrix<Tx_, m_, n_, sma_, sna_>& a)
1511 {
1512 for(int i(0); i < m_; ++i)
1513 v[i].convert(a.v[i]);
1514 }
1515
1525 CUDA_HOST_DEVICE T_& operator()(int i, int j)
1526 {
1527 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1528 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1529 return v[i][j];
1530 }
1531
1533 CUDA_HOST_DEVICE const T_& operator()(int i, int j) const
1534 {
1535 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1536 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1537 return v[i][j];
1538 }
1539
1549 CUDA_HOST_DEVICE RowType& operator[](int i)
1550 {
1551 ASSERTM( (i >= 0) && (i <m_), "index i out-of-bounds");
1552 return v[i];
1553 }
1554
1556 CUDA_HOST_DEVICE const RowType& operator[](int i) const
1557 {
1558 ASSERTM( (i >= 0) && (i <m_), "index i out-of-bounds");
1559 return v[i];
1560 }
1561
1563 CUDA_HOST_DEVICE Matrix& operator*=(DataType alpha)
1564 {
1565 for(int i(0); i < m_; ++i)
1566 {
1567 v[i] *= alpha;
1568 }
1569 return *this;
1570 }
1571
1573 template<int sma_, int sna_>
1575 {
1576 for(int i(0); i < m_; ++i)
1577 {
1578 v[i] += a.v[i];
1579 }
1580 return *this;
1581 }
1582
1584 template<int sma_, int sna_>
1586 {
1587 for(int i(0); i < m_; ++i)
1588 {
1589 v[i] -= a.v[i];
1590 }
1591 return *this;
1592 }
1593
1600 template<int sma_, int sna_>
1601 CUDA_HOST_DEVICE void copy(const Matrix<T_, m_, n_, sma_, sna_>& a)
1602 {
1603 for(int i(0); i < m_; ++i)
1604 for(int j(0); j < n_; ++j)
1605 v[i][j] = a.v[i][j];
1606 }
1607
1620 template<int mm_, int nn_, int ma_, int na_, int sma_, int sna_>
1621 CUDA_HOST_DEVICE void copy_n(const Matrix<T_, ma_, na_, sma_, sna_>& a)
1622 {
1623 static_assert(mm_ <= m_, "invalid copy_n size");
1624 static_assert(mm_ <= ma_, "invalid copy_n size");
1625 static_assert(nn_ <= n_, "invalid copy_n size");
1626 static_assert(nn_ <= na_, "invalid copy_n size");
1627 for(int i(0); i < mm_; ++i)
1628 for(int j(0); j < nn_; ++j)
1629 v[i][j] = a.v[i][j];
1630 }
1631
1638 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
1639 {
1640 for(int i(0); i < m_; ++i)
1641 {
1642 v[i].format(alpha);
1643 }
1644 }
1645
1657 CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
1658 {
1659 DataType r(0);
1660 for(int i(0); i < m_; ++i)
1661 {
1662 #ifndef __CUDACC__
1663 r += Math::sqr(v[i][i]);
1664 #else
1665 r += CudaMath::cuda_sqr(v[i][i]);
1666 #endif
1667 for(int j(0); j < n_; ++j)
1668 {
1669 #ifndef __CUDACC__
1670 r += Math::sqr(v[i][j]);
1671 #else
1672 r += CudaMath::cuda_sqr(v[i][j]);
1673 #endif
1674 }
1675 }
1676 return r / DataType(2);
1677 }
1678
1687 CUDA_HOST_DEVICE DataType norm_frobenius() const
1688 {
1689 #ifndef __CUDACC__
1690 return Math::sqrt(norm_frobenius_sqr());
1691 #else
1692 return CudaMath::cuda_sqrt(norm_frobenius_sqr());
1693 #endif
1694 }
1695
1704 CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
1705 {
1706 DataType r(0);
1707 for(int i(0); i < m_; ++i)
1708 {
1709 for(int j(0); j < n_; ++j)
1710 {
1711 #ifndef __CUDACC__
1712 r += Math::sqr(v[i][j]);
1713 #else
1714 r += CudaMath::cuda_sqr(v[i][j]);
1715 #endif
1716 }
1717 }
1718 return r;
1719 }
1720
1726 CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
1727 {
1728 DataType r(0);
1729 for(int i(0); i < m_; ++i)
1730 {
1731 for(int j(0); j < n_; ++j)
1732 {
1733 #ifndef __CUDACC__
1734 r += Math::sqr(v[i][j] - DataType(i == j ? 1 : 0));
1735 #else
1736 r += CudaMath::cuda_sqr(v[i][j] - DataType(i == j ? 1 : 0));
1737 #endif
1738 }
1739 }
1740 #ifndef __CUDACC__
1741 return Math::sqrt(r);
1742 #else
1743 return CudaMath::cuda_sqrt(r);
1744 #endif
1745 }
1746
1756 CUDA_HOST_DEVICE DataType trace() const
1757 {
1758 #ifndef __CUDACC__
1759 int k = Math::min(m_, n_);
1760 #else
1761 int k = CudaMath::cuda_min(m_, n_);
1762 #endif
1763 DataType r(0);
1764 for(int i(0); i < k; ++i)
1765 {
1766 r += v[i][i];
1767 }
1768 return r;
1769 }
1770
1778 CUDA_HOST_DEVICE DataType det() const
1779 {
1780 return Intern::DetHelper<m_, n_>::compute(*this);
1781 }
1782
1794 CUDA_HOST_DEVICE DataType vol() const
1795 {
1796 return Intern::VolHelper<m_, n_>::compute(*this);
1797 }
1798
1809 template<int sma_, int sna_>
1811 {
1812 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1813 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1814 Intern::InverseHelper<m_, n_>::compute(*this, a);
1815 return *this;
1816 }
1817
1818 #ifdef __CUDACC__
 1834 template<typename ThreadGroup_, int sma_, int sna_>
      // Cooperative inversion: delegates to Intern::CudaGroupedInverseHelper with a
      // thread group \p tg and a caller-supplied \p det.
      // NOTE(review): helper is declared elsewhere; presumably det is the precomputed
      // determinant of 'a' used for scaling — confirm against the helper's definition.
 1835 CUDA_HOST_DEVICE __forceinline__ Matrix& grouped_set_inverse(const ThreadGroup_& tg, const Matrix<T_, m_, n_, sma_, sna_>& a, const T_& det)
 1836 {
 1837 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
 1838 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
 1839 Intern::CudaGroupedInverseHelper<m_, n_>::compute(tg, *this, a, det);
 1840 return *this;
 1841 }
1842 #endif
1843
1861 template<int sma_, int sna_>
1863 {
1864 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1865 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1866 Intern::CofactorHelper<m_, n_>::compute(*this, a);
1867 return *this;
1868 }
1869
1878 template<int sma_, int sna_>
1880 {
1881 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1882 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1883
1884 for(int i(0); i < m_; ++i)
1885 {
1886 for(int j(0); j < n_; ++j)
1887 {
1888 v[i][j] = a.v[j][i];
1889 }
1890 }
1891 return *this;
1892 }
1893
1906 template<int l_, int sla_, int sna_>
1908 {
1909 static_assert(m_ == n_, "Gram matrices must be square");
1910
1911 format();
1912
1913 for(int k(0); k < l_; ++k)
1914 {
1915 for(int i(0); i < n_; ++i)
1916 {
1917 for(int j(0); j < n_; ++j)
1918 {
1919 v[i][j] += a.v[k][i] * a.v[k][j];
1920 }
1921 }
1922 }
1923
1924 return *this;
1925 }
1926
1942 template<int snx_, int sny_>
1943 CUDA_HOST_DEVICE DataType scalar_product(const Vector<T_, m_, snx_>& x, const Vector<T_, n_, sny_>& y) const
1944 {
1945 DataType r(DataType(0));
1946 for(int i(0); i < m_; ++i)
1947 {
1948 r += x[i] * dot(v[i], y);
1949 }
1950 return r;
1951 }
1952
1970 template<int snx_, int sny_>
1971 CUDA_HOST_DEVICE Matrix& add_outer_product(
1972 const Vector<T_, m_, snx_>& x,
1973 const Vector<T_, n_, sny_>& y,
1974 const DataType alpha = DataType(1))
1975 {
1976 for(int i(0); i < m_; ++i)
1977 {
1978 for(int j(0); j < n_; ++j)
1979 {
1980 v[i][j] += alpha * x[i] * y[j];
1981 }
1982 }
1983 return *this;
1984 }
1985
2000 template<int snx_, int sny_>
2001 CUDA_HOST_DEVICE Matrix& set_outer_product(
2002 const Vector<T_, m_, snx_>& x,
2003 const Vector<T_, n_, sny_>& y)
2004 {
2005 for(int i(0); i < m_; ++i)
2006 {
2007 for(int j(0); j < n_; ++j)
2008 {
2009 v[i][j] = x[i] * y[j];
2010 }
2011 }
2012 return *this;
2013 }
2014
2026 template<int sma_, int sna_>
2027 CUDA_HOST_DEVICE Matrix& axpy(DataType alpha, const Matrix<T_, m_, n_, sma_, sna_>& a)
2028 {
2029 for(int i(0); i < m_; ++i)
2030 {
2031 for(int j(0); j < n_; ++j)
2032 {
2033 v[i][j] += alpha * a.v[i][j];
2034 }
2035 }
2036 return *this;
2037 }
2038
2045 CUDA_HOST_DEVICE Matrix& add_scalar_main_diag(DataType alpha)
2046 {
2047 for(int i(0); (i < m_) && (i < n_); ++i)
2048 v[i][i] += alpha;
2049 return *this;
2050 }
2051
2070 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2071 CUDA_HOST_DEVICE Matrix& add_mat_mat_mult(
2074 DataType alpha = DataType(1))
2075 {
2076 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2077 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2078 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2079 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2080
2081 for(int i(0); i < m_; ++i)
2082 {
2083 for(int j(0); j < n_; ++j)
2084 {
2085 DataType r(0);
2086 for(int k(0); k < la_; ++k)
2087 {
2088 r += a.v[i][k] * b.v[k][j];
2089 }
2090 v[i][j] += alpha * r;
2091 }
2092 }
2093 return *this;
2094 }
2095
2111 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2113 {
2114 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2115 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2116 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2117 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2118
2119 format();
2120 return add_mat_mat_mult(a, b);
2121 }
2122
2144 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2145 CUDA_HOST_DEVICE Matrix& add_double_mat_mult(
2149 DataType alpha = DataType(1))
2150 {
2151 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2152 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2153 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2154 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2155
2156 for(int i(0); i < m_; ++i)
2157 {
2158 for(int j(0); j < n_; ++j)
2159 {
2160 DataType r(0);
2161 for(int p(0); p < k_; ++p)
2162 {
2163 DataType t(0);
2164 for(int q(0); q < l_; ++q)
2165 {
2166 t += a(p,q) * d(q,j);
2167 }
2168 r += b(p,i)*t;
2169 }
2170 v[i][j] += alpha * r;
2171 }
2172 }
2173 return *this;
2174 }
2175
2197 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2198 CUDA_HOST_DEVICE Matrix& set_double_mat_mult(
2202 T_ alpha = T_(1))
2203 {
2204 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2205 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2206 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2207 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2208
2209 format();
2210 return add_double_mat_mult(a, b, d, alpha);
2211 }
2212
2233 template<int l_, int snv_, int slt_, int smt_, int snt_>
2234 CUDA_HOST_DEVICE Matrix& add_vec_tensor_mult(
2235 const Vector<T_, l_, snv_>& x,
2237 DataType alpha = DataType(1))
2238 {
2239 for(int i(0); i < m_; ++i)
2240 {
2241 for(int j(0); j < n_; ++j)
2242 {
2243 DataType r(0);
2244 for(int k(0); k < l_; ++k)
2245 {
2246 r += x(k) * t(k,i,j);
2247 }
2248 v[i][j] += alpha * r;
2249 }
2250 }
2251 return *this;
2252 }
2253
2274 template<int l_, int snv_, int slt_, int smt_, int snt_>
2275 CUDA_HOST_DEVICE Matrix& set_vec_tensor_mult(
2276 const Vector<T_, l_, snv_>& x,
2278 DataType alpha = DataType(1))
2279 {
2280 format();
2281 return add_vec_tensor_mult(x, t, alpha);
2282 }
2283
2289 CUDA_HOST_DEVICE Matrix& set_identity()
2290 {
2291 for(int i(0); i < m_; ++i)
2292 for(int j(0); j < n_; ++j)
2293 v[i][j] = (i == j ? T_(1) : T_(0));
2294 return *this;
2295 }
2296
2305 CUDA_HOST_DEVICE Matrix& set_rotation_2d(T_ angle)
2306 {
2307 static_assert((m_ == 2) && (n_ == 2), "this function works only for 2x2 matrices");
2308 #ifndef __CUDACC__
2309 v[0][0] = (v[1][1] = Math::cos(angle));
2310 v[0][1] = -(v[1][0] = Math::sin(angle));
2311 #else
2312 v[0][0] = (v[1][1] = CudaMath::cuda_cos(angle));
2313 v[0][1] = -(v[1][0] = CudaMath::cuda_sin(angle));
2314 #endif
2315 return *this;
2316 }
2317
2332 CUDA_HOST_DEVICE Matrix& set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
2333 {
2334 static_assert((m_ == 3) && (n_ == 3), "this function works only for 3x3 matrices");
2335 #ifndef __CUDACC__
2336 const T_ cy = Math::cos(yaw);
2337 const T_ sy = Math::sin(yaw);
2338 const T_ cp = Math::cos(pitch);
2339 const T_ sp = Math::sin(pitch);
2340 const T_ cr = Math::cos(roll);
2341 const T_ sr = Math::sin(roll);
2342 #else
2343 const T_ cy = CudaMath::cuda_cos(yaw);
2344 const T_ sy = CudaMath::cuda_sin(yaw);
2345 const T_ cp = CudaMath::cuda_cos(pitch);
2346 const T_ sp = CudaMath::cuda_sin(pitch);
2347 const T_ cr = CudaMath::cuda_cos(roll);
2348 const T_ sr = CudaMath::cuda_sin(roll);
2349 #endif
2350 v[0][0] = cy*cp;
2351 v[0][1] = cy*sp*sr - sy*cr;
2352 v[0][2] = cy*sp*cr + sy*sr;
2353 v[1][0] = sy*cp;
2354 v[1][1] = sy*sp*sr + cy*cr;
2355 v[1][2] = sy*sp*cr - cy*sr;
2356 v[2][0] = -sp;
2357 v[2][1] = cp*sr;
2358 v[2][2] = cp*cr;
2359 return *this;
2360 }
2361
2365 CUDA_HOST_DEVICE static Matrix null()
2366 {
2367 return Matrix(DataType(0));
2368 }
2369
2376 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Matrix& A)
2377 {
2378 for (int i(0) ; i < m-1 ; ++i)
2379 {
2380 lhs << A[i] << "\n";
2381 }
2382 lhs << A[m-1];
2383
2384 return lhs;
2385 }
2386
2393 CUDA_HOST friend std::istream& operator>>(std::istream & in, Matrix& A)
2394 {
2395 for (int i(0) ; i < m; ++i)
2396 {
2397 in >> A[i];
2398 }
2399
2400 return in;
2401 }
2402 }; // class Matrix
2403
2413 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2415 {
2416 static_assert(mx_ >= 2, "invalid nu vector size for orthogonal_2x1");
2417 static_assert(ma_ >= 2, "invalid matrix row size for orthogonal_2x1");
2418 static_assert(na_ >= 1, "invalid matrix column size for orthogonal_2x1");
2419
2420 // 2d "cross" product. The sign has to be on the second component so the input is rotated in negative direction
2421 nu[0] = tau[1][0];
2422 nu[1] = -tau[0][0];
2423 }
2424
2434 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2436 {
2437 static_assert(mx_ >= 3, "invalid nu vector size for orthogonal_3x2");
2438 static_assert(ma_ >= 3, "invalid matrix row size for orthogonal_3x2");
2439 static_assert(na_ >= 2, "invalid matrix column size for orthogonal_3x2");
2440
2441 // 3d cross product
2442 nu[0] = tau[1][0]*tau[2][1] - tau[2][0]*tau[1][1];
2443 nu[1] = tau[2][0]*tau[0][1] - tau[0][0]*tau[2][1];
2444 nu[2] = tau[0][0]*tau[1][1] - tau[1][0]*tau[0][1];
2445 }
2446
2447#ifdef DOXYGEN
2471 template<typename T_, int m_, int sm_, int sn_>
2473#endif
2474
2476 template<typename T_, int sm_, int sn_>
2477 CUDA_HOST_DEVICE Vector<T_, 2> orthogonal(const Matrix<T_, 2, 1, sm_, sn_>& tau)
2478 {
2479 Vector<T_, 2, sm_> nu(T_(0));
2480 orthogonal_2x1(nu, tau);
2481 return nu;
2482 }
2483
2484 template<typename T_, int sm_, int sn_>
2485 CUDA_HOST_DEVICE Vector<T_, 3> orthogonal(const Matrix<T_, 3, 2, sm_, sn_>& tau)
2486 {
2487 Vector<T_, 3, sm_> nu(T_(0));
2488 orthogonal_3x2(nu, tau);
2489 return nu;
2490 }
2492
2494 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2496 {
2497 return Vector<T_, m_>().set_mat_vec_mult(a, x);
2498 }
2499
2501 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2503 {
2504 return Vector<T_, n_>().set_vec_mat_mult(x, a);
2505 }
2506
2508 template<typename T_, int m_, int n_, int sm_, int sn_>
2510 {
2511 return Matrix<T_, m_, n_>(a) *= alpha;
2512 }
2513
2515 template<typename T_, int m_, int n_, int sm_, int sn_>
2517 {
2518 return Matrix<T_, m_, n_>(a) *= alpha;
2519 }
2520
2522 template<typename T_, int m_, int n_, int l_, int sma_, int sna_, int smb_, int snb_>
2524 {
2525 return Matrix<T_, m_, n_>().set_mat_mat_mult(a, b);
2526 }
2527
2529 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2531 {
2532 return Matrix<T_, m_, n_>(a) += b;
2533 }
2534
2536 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2538 {
2539 return Matrix<T_, m_, n_>(a) -= b;
2540 }
2541
2542 /* ************************************************************************************************************* */
2543 /* ************************************************************************************************************* */
2544 // Tiny Tensor3 implementation
2545 /* ************************************************************************************************************* */
2546 /* ************************************************************************************************************* */
2547
2548 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2549 class Tensor3
2550 {
2551 static_assert(l_ > 0, "invalid tube count");
2552 static_assert(m_ > 0, "invalid row count");
2553 static_assert(n_ > 0, "invalid column count");
2554 static_assert(sl_ >= l_, "invalid tube stride");
2555 static_assert(sm_ >= m_, "invalid row stride");
2556 static_assert(sn_ >= n_, "invalid column stride");
2557
2558 public:
2560 static constexpr int l = l_;
2562 static constexpr int m = m_;
2564 static constexpr int n = n_;
2566 static constexpr int sl = sl_;
2568 static constexpr int sm = sm_;
2570 static constexpr int sn = sn_;
2571
2573 typedef T_ ValueType;
2575 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
2576
2581
2583 CUDA_HOST_DEVICE Tensor3()
2584 {
2585 }
2586
2588 CUDA_HOST_DEVICE explicit Tensor3(DataType value)
2589 {
2590 for(int i(0); i < l_; ++i)
2591 v[i] = value;
2592 }
2593
2600 template<typename Tx_>
2601 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<Tx_>& x)
2602 {
2603 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2604 auto it(x.begin());
2605 for(int i(0); i < l_; ++i, ++it)
2606 v[i] = *it;
2607 }
2608
2615 template<typename Tx_>
2616 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2617 {
2618 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2619 auto it(x.begin());
2620 for(int i(0); i < l_; ++i, ++it)
2621 v[i] = *it;
2622 }
2623
2625 template<int sla_, int sma_, int sna_>
2627 {
2628 for(int i(0); i < l_; ++i)
2629 v[i] = a.v[i];
2630 }
2631
2633 CUDA_HOST_DEVICE Tensor3& operator=(DataType value)
2634 {
2635 for(int i(0); i < l_; ++i)
2636 v[i] = value;
2637 return *this;
2638 }
2639
2641 template<int sla_, int sma_, int sna_>
2643 {
2644 for(int i(0); i < l_; ++i)
2645 v[i] = a.v[i];
2646 return *this;
2647 }
2648
2657 template<typename Tx_>
2658 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<Tx_>& x)
2659 {
2660 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2661 auto it(x.begin());
2662 for(int i(0); i < l_; ++i, ++it)
2663 v[i] = *it;
2664 return *this;
2665 }
2666
2675 template<typename Tx_>
2676 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2677 {
2678 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2679 auto it(x.begin());
2680 for(int i(0); i < l_; ++i, ++it)
2681 v[i] = *it;
2682 return *this;
2683 }
2684
2686 template<typename Tx_, int sla_, int sma_, int sna_>
2688 {
2689 for(int i(0); i < l_; ++i)
2690 v[i].convert(a.v[i]);
2691 }
2692
2702 CUDA_HOST_DEVICE T_& operator()(int h, int i, int j)
2703 {
2704 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2705 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2706 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2707 return v[h](i,j);
2708 }
2709
2711 CUDA_HOST_DEVICE const T_& operator()(int h, int i, int j) const
2712 {
2713 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2714 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2715 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2716 return v[h](i,j);
2717 }
2718
2728 CUDA_HOST_DEVICE PlaneType& operator[](int h)
2729 {
2730 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2731 return v[h];
2732 }
2733
2735 CUDA_HOST_DEVICE const PlaneType& operator[](int h) const
2736 {
2737 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2738 return v[h];
2739 }
2740
2742 CUDA_HOST_DEVICE Tensor3& operator*=(DataType alpha)
2743 {
2744 for(int i(0); i < l_; ++i)
2745 v[i] *= alpha;
2746 return *this;
2747 }
2748
2750 template<int sla_, int sma_, int sna_>
2752 {
2753 for(int i(0); i < l_; ++i)
2754 v[i] += a.v[i];
2755 return *this;
2756 }
2757
2759 template<int sla_, int sma_, int sna_>
2761 {
2762 for(int i(0); i < l_; ++i)
2763 v[i] -= a.v[i];
2764 return *this;
2765 }
2766
2773 template<int sla_, int sma_, int sna_>
2774 CUDA_HOST_DEVICE void copy(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2775 {
2776 for(int i(0); i < l_; ++i)
2777 for(int j(0); j < m_; ++j)
2778 for(int k(0); k < n_; ++k)
2779 v[i][j][k] = a.v[i][j][k];
2780 }
2781
2794 template<int ll_,int mm_, int nn_, int la_, int ma_, int na_, int sla_, int sma_, int sna_>
2796 {
2797 static_assert(ll_ <= l_, "invalid copy_n size");
2798 static_assert(ll_ <= la_, "invalid copy_n size");
2799 static_assert(mm_ <= m_, "invalid copy_n size");
2800 static_assert(mm_ <= ma_, "invalid copy_n size");
2801 static_assert(nn_ <= n_, "invalid copy_n size");
2802 static_assert(nn_ <= na_, "invalid copy_n size");
2803 for(int i(0); i < ll_; ++i)
2804 for(int j(0); j < mm_; ++j)
2805 for(int k(0); k < nn_; ++k)
2806 v[i][j][k] = a.v[i][j][k];
2807 }
2808
2810 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
2811 {
2812 (*this) = alpha;
2813 }
2814
2835 template<int k_, int sma_, int sna_, int slt_, int smt_, int snt_>
2836 CUDA_HOST_DEVICE Tensor3& add_mat_tensor_mult(
2839 DataType alpha = DataType(1))
2840 {
2841 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2842 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2843
2844 for(int h(0); h < l_; ++h)
2845 {
2846 for(int i(0); i < m_; ++i)
2847 {
2848 for(int j(0); j < n_; ++j)
2849 {
2850 DataType r(0);
2851 for(int p(0); p < k_; ++p)
2852 {
2853 r += a(h,p) * t(p,i,j);
2854 }
2855 operator()(h,i,j) += alpha * r;
2856 }
2857 }
2858 }
2859 return *this;
2860 }
2861
2886 template<
2887 int lt_, int mt_, int nt_, // input tensor dimensions
2888 int slt_, int smt_, int snt_, // input tensor strides
2889 int smb_, int snb_, int smd_, int snd_> // input matrix strides
2890 CUDA_HOST_DEVICE Tensor3& add_double_mat_mult(
2894 DataType alpha = DataType(1))
2895 {
2896 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2897 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2898
2899 for(int h(0); h < l_; ++h)
2900 {
2901 for(int i(0); i < m_; ++i)
2902 {
2903 for(int j(0); j < n_; ++j)
2904 {
2905 DataType r(0);
2906 for(int p(0); p < mt_; ++p)
2907 {
2908 for(int q(0); q < nt_; ++q)
2909 {
2910 r += t(h,p,q) * b(p,i) * d(q,j);
2911 }
2912 }
2913 operator()(h,i,j) += alpha * r;
2914 }
2915 }
2916 }
2917 return *this;
2918 }
2919
2940 template<int slx_, int sma_, int sna_>
2942 const Vector<T_, l_, slx_>& x,
2944 DataType alpha = DataType(1))
2945 {
2946 for(int h(0); h < l_; ++h)
2947 {
2948 for(int i(0); i < m_; ++i)
2949 {
2950 for(int j(0); j < n_; ++j)
2951 {
2952 operator()(h,i,j) += alpha * x(h) * a(i, j);
2953 }
2954 }
2955 }
2956 return *this;
2957 }
2958
2962 CUDA_HOST_DEVICE static Tensor3 null()
2963 {
2964 return Tensor3(DataType(0));
2965 }
2966 }; // class Tensor3<...>
2967
2969 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2972 {
2973 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2974 }
2975
2977 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2980 {
2981 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2982 }
2983
2984 /* ************************************************************************************************************* */
2985 /* ************************************************************************************************************* */
2986 // Various helper functions
2987 /* ************************************************************************************************************* */
2988 /* ************************************************************************************************************* */
3000 template<typename T_, typename std::enable_if<Intern::DataTypeExtractor<T_>::level == 0, bool>::type = true>
3001 CUDA_HOST_DEVICE inline T_ dot(const T_& a, const T_& b)
3002 {
3003 return a*b;
3004 }
3005
3017 template<typename T_, int n_, int sa_, int sb_>
3018 CUDA_HOST_DEVICE inline typename Vector<T_, n_>::DataType dot(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
3019 {
3020 typename Vector<T_, n_>::DataType r(0);
3021
3022 for(int i(0); i < n_; ++i)
3023 {
3024 r += Tiny::dot(a.v[i], b.v[i]);
3025 }
3026 return r;
3027 }
3028
3040 template<typename T_, int m_, int n_, int sma_, int sna_, int smb_, int snb_>
3042 {
3043 typename Matrix<T_, m_, n_>::DataType r(0);
3044 for(int i(0); i < m_; ++i)
3045 {
3046 r += Tiny::dot(a.v[i], b.v[i]);
3047 }
3048 return r;
3049 }
3050
3059 template<typename T_, int l_, int m_, int n_, int sla_ ,int sma_, int sna_, int slb_, int smb_, int snb_>
3060 CUDA_HOST_DEVICE inline typename Tensor3<T_, l_, m_, n_>::DataType dot(
3063 {
3065 for(int i(0); i < l_; ++i)
3066 {
3067 r += Tiny::dot(a.v[i], b.v[i]);
3068 }
3069 return r;
3070 }
3071
3081 template<typename T_>
3082 CUDA_HOST_DEVICE inline void add_id(T_& x, const T_& alpha)
3083 {
3084 x += alpha;
3085 }
3086
3096 template<typename T_, int n_, int sn_>
3097 CUDA_HOST_DEVICE inline void add_id(Vector<T_, n_, sn_>& x, const typename Vector<T_, n_, sn_>::DataType& alpha)
3098 {
3099 for(int i(0); i < n_; ++i)
3100 add_id(x(i), alpha);
3101 }
3102
3112 template<typename T_, int n_, int sm_, int sn_>
3113 CUDA_HOST_DEVICE inline void add_id(Matrix<T_, n_, n_, sm_, sn_>& x, const typename Matrix<T_, n_, n_, sm_, sn_>::DataType& alpha)
3114 {
3115 for(int i(0); i < n_; ++i)
3116 add_id(x(i,i), alpha);
3117 }
3118
3128 template<typename T_, int n_, int sl_, int sm_, int sn_>
3130 {
3131 for(int i(0); i < n_; ++i)
3132 add_id(x(i,i,i), alpha);
3133 }
3134
3149 template<typename T_>
3150 CUDA_HOST_DEVICE inline void axpy(T_& y, const T_& x, const T_& alpha)
3151 {
3152 y += alpha*x;
3153 }
3154
3169 template<typename T_, int n_, int sn_>
3170 CUDA_HOST_DEVICE inline void axpy(
3172 const Vector<T_, n_, sn_>& x,
3173 const typename Vector<T_, n_, sn_>::DataType& alpha)
3174 {
3175 for(int i(0); i < n_; ++i)
3176 axpy(y.v[i], x.v[i], alpha);
3177 }
3178
3193 template<typename T_, int m_, int n_, int sm_, int sn_>
3194 CUDA_HOST_DEVICE inline void axpy(
3197 const typename Matrix<T_, m_, n_, sm_, sn_>::DataType& alpha)
3198 {
3199 for(int i(0); i < m_; ++i)
3200 axpy(y.v[i], x.v[i], alpha);
3201 }
3202
3217 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
3218 CUDA_HOST_DEVICE inline void axpy(
3222 {
3223 for(int i(0); i < l_; ++i)
3224 axpy(y.v[i], x.v[i], alpha);
3225 }
3226
3227 /* ************************************************************************************************************* */
3228 /* ************************************************************************************************************* */
3229 // Internal helpers implementation
3230 /* ************************************************************************************************************* */
3231 /* ************************************************************************************************************* */
3232
3234 namespace Intern
3235 {
3236 // generic square matrix inversion:
3237 template<int n_>
3238 struct DetHelper<n_,n_>
3239 {
3240 template<typename T_, int sma_, int sna_>
3241 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3242 {
3243 // perform matrix inversion which returns the determinant
3245 const T_ det = Intern::InverseHelper<n_, n_>::compute(b, a);
3246
3247 // if the returned value is not normal, we can assume that the matrix is singular
3248 #ifndef __CUDACC__
3249 return Math::isnormal(det) ? det : T_(0);
3250 #else
3251 return CudaMath::cuda_isnormal(det) ? det : T_(0);
3252 #endif
3253 }
3254 };
3255
3256 template<>
3257 struct DetHelper<1,1>
3258 {
3259 template<typename T_, int sma_, int sna_>
3260 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3261 {
3262 return a(0,0);
3263 }
3264 };
3265
3266
3267 template<>
3268 struct DetHelper<2,2>
3269 {
3270 template<typename T_, int sma_, int sna_>
3271 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3272 {
3273 return a(0,0)*a(1,1) - a(0,1)*a(1,0);
3274 }
3275 };
3276
3277 template<>
3278 struct DetHelper<3,3>
3279 {
3280 template<typename T_, int sma_, int sna_>
3281 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3282 {
3283 return a(0,0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1))
3284 + a(0,1)*(a(1,2)*a(2,0) - a(1,0)*a(2,2))
3285 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
3286 }
3287 };
3288
3289 template<>
3290 struct DetHelper<4,4>
3291 {
3292 template<typename T_, int sma_, int sna_>
3293 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
3294 {
3295 // 2x2 determinants of rows 3-4
3296 T_ w[6] =
3297 {
3298 a(2,0)*a(3,1) - a(2,1)*a(3,0),
3299 a(2,0)*a(3,2) - a(2,2)*a(3,0),
3300 a(2,0)*a(3,3) - a(2,3)*a(3,0),
3301 a(2,1)*a(3,2) - a(2,2)*a(3,1),
3302 a(2,1)*a(3,3) - a(2,3)*a(3,1),
3303 a(2,2)*a(3,3) - a(2,3)*a(3,2)
3304 };
3305
3306 return
3307 + a(0,0) * (a(1,1)*w[5] - a(1,2)*w[4] + a(1,3)*w[3])
3308 - a(0,1) * (a(1,0)*w[5] - a(1,2)*w[2] + a(1,3)*w[1])
3309 + a(0,2) * (a(1,0)*w[4] - a(1,1)*w[2] + a(1,3)*w[0])
3310 - a(0,3) * (a(1,0)*w[3] - a(1,1)*w[1] + a(1,2)*w[0]);
3311 }
3312 };
3313
3314 template<>
3315 struct DetHelper<5,5>
3316 {
3317 template<typename T_, int sma_, int sna_>
3318 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
3319 {
3320 // 2x2 determinants of rows 4-5
3321 T_ v[10] =
3322 {
3323 a(3,0)*a(4,1) - a(3,1)*a(4,0),
3324 a(3,0)*a(4,2) - a(3,2)*a(4,0),
3325 a(3,0)*a(4,3) - a(3,3)*a(4,0),
3326 a(3,0)*a(4,4) - a(3,4)*a(4,0),
3327 a(3,1)*a(4,2) - a(3,2)*a(4,1),
3328 a(3,1)*a(4,3) - a(3,3)*a(4,1),
3329 a(3,1)*a(4,4) - a(3,4)*a(4,1),
3330 a(3,2)*a(4,3) - a(3,3)*a(4,2),
3331 a(3,2)*a(4,4) - a(3,4)*a(4,2),
3332 a(3,3)*a(4,4) - a(3,4)*a(4,3)
3333 };
3334 // 3x3 determinants of rows 3-4-5
3335 T_ w[10] =
3336 {
3337 a(2,0)*v[4] - a(2,1)*v[1] + a(2,2)*v[0],
3338 a(2,0)*v[5] - a(2,1)*v[2] + a(2,3)*v[0],
3339 a(2,0)*v[6] - a(2,1)*v[3] + a(2,4)*v[0],
3340 a(2,0)*v[7] - a(2,2)*v[2] + a(2,3)*v[1],
3341 a(2,0)*v[8] - a(2,2)*v[3] + a(2,4)*v[1],
3342 a(2,0)*v[9] - a(2,3)*v[3] + a(2,4)*v[2],
3343 a(2,1)*v[7] - a(2,2)*v[5] + a(2,3)*v[4],
3344 a(2,1)*v[8] - a(2,2)*v[6] + a(2,4)*v[4],
3345 a(2,1)*v[9] - a(2,3)*v[6] + a(2,4)*v[5],
3346 a(2,2)*v[9] - a(2,3)*v[8] + a(2,4)*v[7]
3347 };
3348
3349 return
3350 + a(0,0)*(a(1,1)*w[9] - a(1,2)*w[8] + a(1,3)*w[7] - a(1,4)*w[6])
3351 - a(0,1)*(a(1,0)*w[9] - a(1,2)*w[5] + a(1,3)*w[4] - a(1,4)*w[3])
3352 + a(0,2)*(a(1,0)*w[8] - a(1,1)*w[5] + a(1,3)*w[2] - a(1,4)*w[1])
3353 - a(0,3)*(a(1,0)*w[7] - a(1,1)*w[4] + a(1,2)*w[2] - a(1,4)*w[0])
3354 + a(0,4)*(a(1,0)*w[6] - a(1,1)*w[3] + a(1,2)*w[1] - a(1,3)*w[0]);
3355 }
3356 };
3357
3358 template<>
3359 struct DetHelper<6,6>
3360 {
3361 template<typename T_, int sma_, int sna_>
3362 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
3363 {
3364 // 2x2 determinants of rows 5-6
3365 T_ v[15] =
3366 {
3367 a(4,0)*a(5,1) - a(4,1)*a(5,0),
3368 a(4,0)*a(5,2) - a(4,2)*a(5,0),
3369 a(4,0)*a(5,3) - a(4,3)*a(5,0),
3370 a(4,0)*a(5,4) - a(4,4)*a(5,0),
3371 a(4,0)*a(5,5) - a(4,5)*a(5,0),
3372 a(4,1)*a(5,2) - a(4,2)*a(5,1),
3373 a(4,1)*a(5,3) - a(4,3)*a(5,1),
3374 a(4,1)*a(5,4) - a(4,4)*a(5,1),
3375 a(4,1)*a(5,5) - a(4,5)*a(5,1),
3376 a(4,2)*a(5,3) - a(4,3)*a(5,2),
3377 a(4,2)*a(5,4) - a(4,4)*a(5,2),
3378 a(4,2)*a(5,5) - a(4,5)*a(5,2),
3379 a(4,3)*a(5,4) - a(4,4)*a(5,3),
3380 a(4,3)*a(5,5) - a(4,5)*a(5,3),
3381 a(4,4)*a(5,5) - a(4,5)*a(5,4)
3382 };
3383 // 3x3 determinants of rows 4-5-6
3384 T_ w[20] =
3385 {
3386 a(3,0)*v[ 5] - a(3,1)*v[ 1] + a(3,2)*v[ 0],
3387 a(3,0)*v[ 6] - a(3,1)*v[ 2] + a(3,3)*v[ 0],
3388 a(3,0)*v[ 7] - a(3,1)*v[ 3] + a(3,4)*v[ 0],
3389 a(3,0)*v[ 8] - a(3,1)*v[ 4] + a(3,5)*v[ 0],
3390 a(3,0)*v[ 9] - a(3,2)*v[ 2] + a(3,3)*v[ 1],
3391 a(3,0)*v[10] - a(3,2)*v[ 3] + a(3,4)*v[ 1],
3392 a(3,0)*v[11] - a(3,2)*v[ 4] + a(3,5)*v[ 1],
3393 a(3,0)*v[12] - a(3,3)*v[ 3] + a(3,4)*v[ 2],
3394 a(3,0)*v[13] - a(3,3)*v[ 4] + a(3,5)*v[ 2],
3395 a(3,0)*v[14] - a(3,4)*v[ 4] + a(3,5)*v[ 3],
3396 a(3,1)*v[ 9] - a(3,2)*v[ 6] + a(3,3)*v[ 5],
3397 a(3,1)*v[10] - a(3,2)*v[ 7] + a(3,4)*v[ 5],
3398 a(3,1)*v[11] - a(3,2)*v[ 8] + a(3,5)*v[ 5],
3399 a(3,1)*v[12] - a(3,3)*v[ 7] + a(3,4)*v[ 6],
3400 a(3,1)*v[13] - a(3,3)*v[ 8] + a(3,5)*v[ 6],
3401 a(3,1)*v[14] - a(3,4)*v[ 8] + a(3,5)*v[ 7],
3402 a(3,2)*v[12] - a(3,3)*v[10] + a(3,4)*v[ 9],
3403 a(3,2)*v[13] - a(3,3)*v[11] + a(3,5)*v[ 9],
3404 a(3,2)*v[14] - a(3,4)*v[11] + a(3,5)*v[10],
3405 a(3,3)*v[14] - a(3,4)*v[13] + a(3,5)*v[12]
3406 };
3407 // 4x4 determinants of rows 3-4-5-6
3408 v[ 0] = a(2,0)*w[10] - a(2,1)*w[ 4] + a(2,2)*w[ 1] - a(2,3)*w[ 0];
3409 v[ 1] = a(2,0)*w[11] - a(2,1)*w[ 5] + a(2,2)*w[ 2] - a(2,4)*w[ 0];
3410 v[ 2] = a(2,0)*w[12] - a(2,1)*w[ 6] + a(2,2)*w[ 3] - a(2,5)*w[ 0];
3411 v[ 3] = a(2,0)*w[13] - a(2,1)*w[ 7] + a(2,3)*w[ 2] - a(2,4)*w[ 1];
3412 v[ 4] = a(2,0)*w[14] - a(2,1)*w[ 8] + a(2,3)*w[ 3] - a(2,5)*w[ 1];
3413 v[ 5] = a(2,0)*w[15] - a(2,1)*w[ 9] + a(2,4)*w[ 3] - a(2,5)*w[ 2];
3414 v[ 6] = a(2,0)*w[16] - a(2,2)*w[ 7] + a(2,3)*w[ 5] - a(2,4)*w[ 4];
3415 v[ 7] = a(2,0)*w[17] - a(2,2)*w[ 8] + a(2,3)*w[ 6] - a(2,5)*w[ 4];
3416 v[ 8] = a(2,0)*w[18] - a(2,2)*w[ 9] + a(2,4)*w[ 6] - a(2,5)*w[ 5];
3417 v[ 9] = a(2,0)*w[19] - a(2,3)*w[ 9] + a(2,4)*w[ 8] - a(2,5)*w[ 7];
3418 v[10] = a(2,1)*w[16] - a(2,2)*w[13] + a(2,3)*w[11] - a(2,4)*w[10];
3419 v[11] = a(2,1)*w[17] - a(2,2)*w[14] + a(2,3)*w[12] - a(2,5)*w[10];
3420 v[12] = a(2,1)*w[18] - a(2,2)*w[15] + a(2,4)*w[12] - a(2,5)*w[11];
3421 v[13] = a(2,1)*w[19] - a(2,3)*w[15] + a(2,4)*w[14] - a(2,5)*w[13];
3422 v[14] = a(2,2)*w[19] - a(2,3)*w[18] + a(2,4)*w[17] - a(2,5)*w[16];
3423
3424 return
3425 + a(0,0)*(a(1,1)*v[14] - a(1,2)*v[13] + a(1,3)*v[12] - a(1,4)*v[11] + a(1,5)*v[10])
3426 - a(0,1)*(a(1,0)*v[14] - a(1,2)*v[ 9] + a(1,3)*v[ 8] - a(1,4)*v[ 7] + a(1,5)*v[ 6])
3427 + a(0,2)*(a(1,0)*v[13] - a(1,1)*v[ 9] + a(1,3)*v[ 5] - a(1,4)*v[ 4] + a(1,5)*v[ 3])
3428 - a(0,3)*(a(1,0)*v[12] - a(1,1)*v[ 8] + a(1,2)*v[ 5] - a(1,4)*v[ 2] + a(1,5)*v[ 1])
3429 + a(0,4)*(a(1,0)*v[11] - a(1,1)*v[ 7] + a(1,2)*v[ 4] - a(1,3)*v[ 2] + a(1,5)*v[ 0])
3430 - a(0,5)*(a(1,0)*v[10] - a(1,1)*v[ 6] + a(1,2)*v[ 3] - a(1,3)*v[ 1] + a(1,4)*v[ 0]);
3431 }
3432 };
3433
3439
3440 template<int m_, int n_>
3441 struct VolHelper
3442 {
3443 template<typename T_, int sma_, int sna_>
3444 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3445 {
3446 // generic fallback implementation: compute b := a^T * a and return sqrt(det(b))
3447 Tiny::Matrix<T_, n_, n_> b;
3448 for(int i(0); i < n_; ++i)
3449 {
3450 for(int j(0); j < n_; ++j)
3451 {
3452 b(i,j) = T_(0);
3453 for(int k(0); k < m_; ++k)
3454 {
3455 b(i,j) += a(k,i)*a(k,j);
3456 }
3457 }
3458 }
3459 #ifndef __CUDACC__
3460 return Math::sqrt(DetHelper<n_,n_>::compute(b));
3461 #else
3462 return CudaMath::cuda_sqrt(DetHelper<n_,n_>::compute(b));
3463 #endif
3464 }
3465 };
3466
3467 template<int n_>
3468 struct VolHelper<n_,n_>
3469 {
3470 template<typename T_, int sma_, int sna_>
3471 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3472 {
3473 // square matrix special case: vol(a) = abs(det(a))
3474 #ifndef __CUDACC__
3475 return Math::abs(DetHelper<n_,n_>::compute(a));
3476 #else
3477 return CudaMath::cuda_abs(DetHelper<n_,n_>::compute(a));
3478 #endif
3479 }
3480 };
3481
3482 template<>
3483 struct VolHelper<2,1>
3484 {
3485 template<typename T_, int sma_, int sna_>
3486 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 1, sma_, sna_>& a)
3487 {
3488 // This is the euclid norm of the only matrix column.
3489 #ifndef __CUDACC__
3490 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)));
3491 #else
3492 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)));
3493 #endif
3494 }
3495 };
3496
3497 template<>
3498 struct VolHelper<3,1>
3499 {
3500 template<typename T_, int sma_, int sna_>
3501 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 1, sma_, sna_>& a)
3502 {
3503 // This is the euclid norm of the only matrix column.
3504 #ifndef __CUDACC__
3505 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)) + Math::sqr(a(2,0)));
3506 #else
3507 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)) + CudaMath::cuda_sqr(a(2,0)));
3508 #endif
3509 }
3510 };
3511
3512 template<>
3513 struct VolHelper<3,2>
3514 {
3515 template<typename T_, int sma_, int sna_>
3516 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 2, sma_, sna_>& a)
3517 {
3518 // This is the euclid norm of the 3D cross product of the two matrix columns.
3519 #ifndef __CUDACC__
3520 return Math::sqrt(
3521 Math::sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3522 Math::sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3523 Math::sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3524 #else
3525 return CudaMath::cuda_sqrt(
3526 CudaMath::cuda_sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3527 CudaMath::cuda_sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3528 CudaMath::cuda_sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3529 #endif
3530 }
3531 };
3532
3538
3539 template<>
3540 struct InverseHelper<1,1>
3541 {
3542 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3543 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3544 {
3545 b(0,0) = T_(1) / a(0,0);
3546 return a(0,0);
3547 }
3548 };
3549
3550 template<>
3551 struct InverseHelper<2,2>
3552 {
3553 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3554 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3555 {
3556 T_ det = a(0,0)*a(1,1) - a(0,1)*a(1,0);
3557 T_ d = T_(1) / det;
3558 b(0,0) = d*a(1,1);
3559 b(0,1) = -d*a(0,1);
3560 b(1,0) = -d*a(1,0);
3561 b(1,1) = d*a(0,0);
3562 return det;
3563 }
3564 };
3565
3566 template<>
3567 struct InverseHelper<3,3>
3568 {
3569 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3570 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3571 {
3572 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
3573 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
3574 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
3575 T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
3576 T_ d = T_(1) / det;
3577 b(0,0) *= d;
3578 b(1,0) *= d;
3579 b(2,0) *= d;
3580 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
3581 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
3582 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
3583 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
3584 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
3585 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
3586 return det;
3587 }
3588 };
3589
template<>
struct InverseHelper<4,4>
{
  /// Inverts the 4x4 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column-wise from cofactors; the array w caches
  /// 2x2 sub-determinants that are shared by several cofactors.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
  {
    // 2x2 sub-determinants of rows 2 and 3
    T_ w[6];
    w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
    b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
    b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
    b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3]);
    b(1,1) = d*( a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1]);
    b(2,1) = d*(-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0]);
    b(3,1) = d*( a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0]);
    // reuse w for the 2x2 sub-determinants of rows 0 and 1
    w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    // third and fourth columns of the inverse
    b(0,2) = d*( a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3]);
    b(1,2) = d*(-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1]);
    b(2,2) = d*( a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0]);
    b(3,2) = d*(-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0]);
    b(0,3) = d*(-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3]);
    b(1,3) = d*( a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1]);
    b(2,3) = d*(-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0]);
    b(3,3) = d*( a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0]);
    return det;
  }
};
3634
template<>
struct InverseHelper<5,5>
{
  /// Inverts the 5x5 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column-wise from cofactors; w[0..9] caches 2x2
  /// sub-determinants and w[10..19] caches 3x3 sub-determinants, recomputed
  /// for each group of columns.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
  {
    // w[0..9]: 2x2 sub-determinants of rows 3 and 4
    T_ w[20];
    w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
    w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
    w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
    w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
    w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
    w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
    w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
    w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
    w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
    w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
    // w[10..19]: 3x3 sub-determinants of rows 2, 3 and 4
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
    b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
    b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
    b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
    b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0)+a(0,4)*b(4,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    b(4,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16]);
    b(1,1) = d*( a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13]);
    b(2,1) = d*(-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11]);
    b(3,1) = d*( a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10]);
    b(4,1) = d*(-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10]);
    // recompute w[10..19] as 3x3 sub-determinants of rows 1, 3 and 4
    w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
    w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
    w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
    w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
    w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
    w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
    w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
    w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
    w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
    w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
    // third column of the inverse
    b(0,2) = d*( a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16]);
    b(1,2) = d*(-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13]);
    b(2,2) = d*( a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11]);
    b(3,2) = d*(-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10]);
    b(4,2) = d*( a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10]);
    // recompute w[0..9] as 2x2 sub-determinants of rows 0 and 1
    w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
    w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
    w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
    w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
    // recompute w[10..19] as 3x3 sub-determinants of rows 0, 1 and 2
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
    // fourth and fifth columns of the inverse
    b(0,3) = d*( a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16]);
    b(1,3) = d*(-a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13]);
    b(2,3) = d*( a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11]);
    b(3,3) = d*(-a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10]);
    b(4,3) = d*( a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10]);
    b(0,4) = d*(-a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16]);
    b(1,4) = d*( a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13]);
    b(2,4) = d*(-a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11]);
    b(3,4) = d*( a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10]);
    b(4,4) = d*(-a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10]);
    return det;
  }
};
3727
template<>
struct InverseHelper<6,6>
{
  /// Inverts the 6x6 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column pair by column pair from cofactors;
  /// w[0..14] caches 2x2 sub-determinants, w[15..34] caches 3x3
  /// sub-determinants, and w[0..14] is then reused for 4x4 sub-determinants.
  /// The whole cache is recomputed for each pair of columns.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
  {
    // w[0..14]: 2x2 sub-determinants of rows 4 and 5
    T_ w[35];
    w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
    w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
    w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
    w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
    w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
    w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
    w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
    w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
    w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
    w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
    w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
    w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
    w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
    w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
    w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
    // w[15..34]: 3x3 sub-determinants of rows 3, 4 and 5
    w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
    w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
    w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
    w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
    w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
    w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
    w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
    w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
    w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
    w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
    w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
    w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
    w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
    w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
    w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
    w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
    w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
    w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
    w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
    w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 2, 3, 4 and 5
    w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
    w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
    w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
    w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
    w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
    w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
    w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
    w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
    w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
    w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
    w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
    w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
    w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
    w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
    w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
    b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
    b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
    b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
    b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
    b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0)
      + a(0,3)*b(3,0) + a(0,4)*b(4,0) + a(0,5)*b(5,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    b(4,0) *= d;
    b(5,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10]);
    b(1,1) = d*( a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6]);
    b(2,1) = d*(-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3]);
    b(3,1) = d*( a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1]);
    b(4,1) = d*(-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0]);
    b(5,1) = d*( a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0]);
    // rebuild w[0..14]: 2x2 sub-determinants of rows 4 and 5
    w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
    w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
    w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
    w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
    w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
    w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
    w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
    w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
    w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
    w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
    w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
    w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
    w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
    w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
    w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
    // w[15..34]: 3x3 sub-determinants of rows 1, 4 and 5
    w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
    w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
    w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
    w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
    w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
    w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
    w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
    w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
    w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
    w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
    w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
    w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
    w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
    w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
    w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
    w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
    w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
    w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
    w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
    w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 0, 1, 4 and 5
    w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
    w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
    w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
    w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
    w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
    w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
    w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
    w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
    w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
    w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
    w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
    w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
    w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
    w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
    w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
    // third and fourth columns of the inverse
    b(0,2) = d*( a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10]);
    b(1,2) = d*(-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6]);
    b(2,2) = d*( a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3]);
    b(3,2) = d*(-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1]);
    b(4,2) = d*( a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0]);
    b(5,2) = d*(-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0]);
    b(0,3) = d*(-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10]);
    b(1,3) = d*( a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6]);
    b(2,3) = d*(-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3]);
    b(3,3) = d*( a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1]);
    b(4,3) = d*(-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0]);
    b(5,3) = d*( a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0]);
    // rebuild w[0..14]: 2x2 sub-determinants of rows 2 and 3
    w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
    w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
    w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
    w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
    w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
    w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
    w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
    w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
    w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
    w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
    // w[15..34]: 3x3 sub-determinants of rows 1, 2 and 3
    w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
    w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
    w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
    w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
    w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
    w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
    w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
    w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
    w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
    w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
    w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
    w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
    w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
    w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
    w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
    w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
    w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
    w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
    w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
    w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 0, 1, 2 and 3
    w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
    w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
    w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
    w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
    w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
    w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
    w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
    w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
    w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
    w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
    w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
    w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
    w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
    w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
    w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
    // fifth and sixth columns of the inverse
    b(0,4) = d*( a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10]);
    b(1,4) = d*(-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6]);
    b(2,4) = d*( a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3]);
    b(3,4) = d*(-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1]);
    b(4,4) = d*( a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0]);
    b(5,4) = d*(-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0]);
    b(0,5) = d*(-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10]);
    b(1,5) = d*( a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6]);
    b(2,5) = d*(-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3]);
    b(3,5) = d*( a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1]);
    b(4,5) = d*(-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0]);
    b(5,5) = d*( a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0]);
    return det;
  }
};
3933
// generic square matrix inversion via Gauss-Jordan elimination:
template<int n_>
struct InverseHelper<n_, n_>
{
  /// Inverts the n x n matrix \p a into \p b in-place via Gauss-Jordan
  /// elimination with a "virtual" pivoting strategy (rows are never physically
  /// swapped; a permutation array is exchanged instead) and returns the
  /// determinant of \p a.
  ///
  /// NOTE(review): the pivot search below only inspects the diagonal entries
  /// b(p[j], p[j]) of the (symmetrically permuted) matrix, not an entire
  /// column. This can select a (near-)zero pivot for invertible matrices
  /// whose permuted diagonal vanishes (e.g. a permutation matrix) -- confirm
  /// this is an accepted limitation for the intended inputs.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
  {
    // copy matrix a to b; the elimination below transforms b into a^{-1}
    for (int i(0); i < n_; ++i)
    {
      for (int j(0); j < n_; ++j)
      {
        b(i,j) = a(i,j);
      }
    }

    // create pivot array
    int p[n_];

    // initialize identity permutation
    for(int i(0); i < n_; ++i)
    {
      p[i] = i;
    }

    // initialize determinant to 1; it is accumulated as the product of pivots
    T_ det = T_(1);

    // primary column elimination loop
    for(int k(0); k < n_; ++k)
    {
      // step 1: find a pivot for the elimination of column k
      {
        // for this, we only check the rows p[j] with j >= k, as all
        // rows p[j] with j < k have already been eliminated and are
        // therefore not candidates for pivoting
        #ifdef __CUDACC__
        T_ pivot = CudaMath::cuda_abs(b(p[k], p[k]));
        #else
        T_ pivot = Math::abs(b(p[k], p[k]));
        #endif
        int i = k;

        // loop over all unprocessed rows
        for(int j(k+1); j < n_; ++j)
        {
          // get our matrix value and check whether it can be a pivot
          #ifdef __CUDACC__
          T_ abs_val = CudaMath::cuda_abs(b(p[j], p[j]));
          #else
          T_ abs_val = Math::abs(b(p[j], p[j]));
          #endif
          if(abs_val > pivot)
          {
            pivot = abs_val;
            i = j;
          }
        }

        // do we have to swap rows i and k?
        if(i > k)
        {
          // swap rows "virtually" by exchanging their permutation positions;
          // rows and columns are permuted symmetrically, so the determinant
          // sign is unaffected and no sign bookkeeping is required here
          int t = p[k];
          p[k] = p[i];
          p[i] = t;
        }
      }

      // step 2: process pivot row
      {
        // update determinant by multiplying with the pivot element
        det *= b(p[k], p[k]);

        // get our inverted pivot element
        const T_ pivot = T_(1) / b(p[k], p[k]);

        // replace column entry by unit column entry
        b(p[k], p[k]) = T_(1);

        // scale the whole row by the inverted pivot
        for(int j(0); j < n_; ++j)
        {
          b(p[k], j) *= pivot;
        }
      }

      // step 3: eliminate pivot column

      // loop over all rows of the matrix
      for(int i(0); i < n_; ++i)
      {
        // skip the pivot row
        if(i == p[k])
          continue;

        // fetch elimination factor
        const T_ factor = b(i, p[k]);

        // replace by unit column entry
        b(i, p[k]) = T_(0);

        // process the row
        for(int j(0); j < n_; ++j)
        {
          b(i, j) -= b(p[k], j) * factor;
        }
      }
    }

    // return determinant
    return det;
  }
};
4048
4054
4055 #ifdef __CUDACC__
4056 template<>
4057 struct CudaGroupedInverseHelper<1,1>
4058 {
4059 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4060 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&, const T_& det)
4061 {
4062 if(tg.thread_rank() == 0)
4063 b(0,0) = T_(1) / det;
4064 }
4065 };
4066
4067 template<>
4068 struct CudaGroupedInverseHelper<2,2>
4069 {
4070 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4071 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a, const T_& det)
4072 {
4073 //i think an if else cascade could do better than heavy modulo operations...
4074 T_ d = T_(1) / det;
4075
4076 for(int idx = tg.thread_rank(); idx < 4; idx += tg.num_threads())
4077 {
4078 const int i = idx /2;
4079 const int j = idx %2;
4080 b(i,0) = ((i+j)==1 ? T_(-1) : T_(1)) * d * a(1-j,1-i);
4081
4082 }
4083 b(0,0) = d*a(1,1);
4084 b(0,1) = -d*a(0,1);
4085 b(1,0) = -d*a(1,0);
4086 b(1,1) = d*a(0,0);
4087 }
4088 };
4089
4090 template<>
4091 struct CudaGroupedInverseHelper<3,3>
4092 {
4093 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4094 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a, const T_& det)
4095 {
4096 // for(int i = tg.thread_rank(); i < 3; i += tg.num_threads())
4097 // {
4098 // b[i*snb_+0] = a[1*sna_+1+i-3*(i/2)]*a[2*sna_+(2*(1-i+i/2))] - a[1*sna_+(2*(1-i+i/2))]*a[2*sna_+1+i-3*(i/2)];
4099 // }
4100 // tg.sync();
4101 // T_ det = a[0*sna_+0]*b[0*snb_+0] + a[0*sna_+1]*b[1*snb_+0] + a[0*sna_+2]*b[2*snb_+0];
4102 // T_ d = T_(1) / det;
4103 // for(int idx = tg.thread_rank(); idx < 3; idx += tg.num_threads())
4104 // {
4105 // const int i = idx/3;
4106 // const int j = idx%3;
4107 // b[i*snb_+j] = d*(a[()*sna_+1]*a[2*sna_+2] - a[1*sna_+2]*a[2*sna_+1])
4108
4109 // }
4110 //i think an if else cascade could do better than heavy modulo operations...
4111 T_ d = T_(1) / det;
4112
4113 for(int idx = tg.thread_rank(); idx < 9; idx += tg.num_threads())
4114 {
4115 if(idx == 0)
4116 b(0,0) = d*(a(1,1)*a(2,2) - a(1,2)*a(2,1));
4117 else if(idx == 3)
4118 b(1,0) = d*(a(1,2)*a(2,0) - a(1,0)*a(2,2));
4119 else if(idx == 6)
4120 b(2,0) = d*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
4121 else if(idx == 1)
4122 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
4123 else if(idx == 4)
4124 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
4125 else if(idx == 7)
4126 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
4127 else if(idx == 2)
4128 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
4129 else if(idx == 5)
4130 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
4131 else if(idx == 8)
4132 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
4133 }
4134 }
4135 };
4136
// generic square matrix inversion:
template<int n_>
struct CudaGroupedInverseHelper<n_, n_>
{
  /// Generic grouped n x n inversion: copies \p a into \p b cooperatively and
  /// then delegates the in-place inversion to CudaMath::cuda_grouped_invert_matrix,
  /// executed by the first (up to) 32 threads of \p tg as a coalesced subgroup.
  /// The determinant argument is unused here.
  template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
  CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a, const T_&)
  {
    // copy matrix a to b, entries distributed over the thread group
    for (int i = tg.thread_rank(); i < n_*n_; i += tg.num_threads())
    {
      const int row = i/n_;
      const int col = i%n_;
      b.v[row][col] = a.v[row][col];
    }

    // create shared pivot array
    // NOTE(review): this is one static __shared__ array per block -- this
    // presumably assumes that only a single thread group per block executes
    // this helper at a time; confirm with the callers.
    __shared__ int p[n_];
    for (int i = tg.thread_rank(); i < n_; i += tg.num_threads())
    {
      p[i] = 0;
    }
    // barrier: the copy and the pivot initialization above must be visible
    // to the inverting subgroup below
    tg.sync();
    // call grouped invert from first warp subgroup
    if(tg.thread_rank() < 32)
    {
      auto a_g = cg::coalesced_threads();

      CudaMath::cuda_grouped_invert_matrix(a_g, n_, snb_, &b.v[0][0], p);
    }
    tg.sync();
  }
};
4169 #endif // __CUDACC__
4170
4176
4177 template<int m_, int n_>
4178 struct CofactorHelper
4179 {
4180 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4181 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, m_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, m_, n_, sma_, sna_>& a)
4182 {
4183 static_assert(m_ == n_, "cofactor computation is only available for square matrices!");
4184
4185 // compute inverse
4186 const T_ det = Intern::InverseHelper<m_,n_>::compute(b, a);
4187
4188 for(int i(0); i < m_; ++i)
4189 {
4190 for(int j(0); j <= i; ++j)
4191 {
4192 b(i,j) *= det;
4193 }
4194 for(int j(i+1); j < n_; ++j)
4195 {
4196 std::swap(b(i,j), b(j,i));
4197 b(i,j) *= det;
4198 }
4199 }
4200 }
4201 };
4202
4203 template<>
4204 struct CofactorHelper<1,1>
4205 {
4206 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4207 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&)
4208 {
4209 b[0] = T_(1);
4210 }
4211 };
4212
4213 template<>
4214 struct CofactorHelper<2,2>
4215 {
4216 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4217 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
4218 {
4219 b(0,0) = a(1,1);
4220 b(0,1) = -a(0,1);
4221 b(1,0) = -a(1,0);
4222 b(1,1) = a(0,0);
4223 }
4224 };
4225
4226 template<>
4227 struct CofactorHelper<3,3>
4228 {
4229 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4230 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
4231 {
4232 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
4233 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
4234 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
4235 b(0,1) = a(0,2)*a(2,1) - a(0,1)*a(2,2);
4236 b(1,1) = a(0,0)*a(2,2) - a(0,2)*a(2,0);
4237 b(2,1) = a(0,1)*a(2,0) - a(0,0)*a(2,1);
4238 b(0,2) = a(0,1)*a(1,2) - a(0,2)*a(1,1);
4239 b(1,2) = a(0,2)*a(1,0) - a(0,0)*a(1,2);
4240 b(2,2) = a(0,0)*a(1,1) - a(0,1)*a(1,0);
4241 }
4242 };
4243
template<>
struct CofactorHelper<4,4>
{
  /// 4x4 case: the result is assembled column-wise from cofactors, caching
  /// 2x2 sub-determinants in w. The formulas equal those of
  /// InverseHelper<4,4> without the 1/det scaling.
  /// \note \p b must not alias \p a.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
  {
    // 2x2 sub-determinants of rows 2 and 3
    T_ w[6];
    w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);

    // first two columns of the result
    b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
    b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
    b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
    b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];

    b(0,1) =-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3];
    b(1,1) = a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1];
    b(2,1) =-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0];
    b(3,1) = a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0];

    // reuse w for the 2x2 sub-determinants of rows 0 and 1
    w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);

    // last two columns of the result
    b(0,2) = a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3];
    b(1,2) =-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1];
    b(2,2) = a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0];
    b(3,2) =-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0];

    b(0,3) =-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3];
    b(1,3) = a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1];
    b(2,3) =-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0];
    b(3,3) = a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0];
  }
};
4286
template<>
struct CofactorHelper<5,5>
{
  /// 5x5 case: the result is assembled column-wise from cofactors; w[0..9]
  /// caches 2x2 sub-determinants and w[10..19] caches 3x3 sub-determinants,
  /// recomputed for each group of columns. The formulas equal those of
  /// InverseHelper<5,5> without the 1/det scaling.
  /// \note \p b must not alias \p a.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
  {
    // w[0..9]: 2x2 sub-determinants of rows 3 and 4
    T_ w[20];
    w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
    w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
    w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
    w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
    w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
    w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
    w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
    w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
    w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
    w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
    // w[10..19]: 3x3 sub-determinants of rows 2, 3 and 4
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];

    // first two columns of the result
    b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
    b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
    b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
    b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
    b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];

    b(0,1) =-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16];
    b(1,1) = a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13];
    b(2,1) =-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11];
    b(3,1) = a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10];
    b(4,1) =-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10];

    // recompute w[10..19] as 3x3 sub-determinants of rows 1, 3 and 4
    w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
    w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
    w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
    w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
    w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
    w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
    w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
    w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
    w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
    w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];

    // third column of the result
    b(0,2) = a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16];
    b(1,2) =-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13];
    b(2,2) = a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11];
    b(3,2) =-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10];
    b(4,2) = a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10];

    // recompute w[0..9] as 2x2 sub-determinants of rows 0 and 1
    w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
    w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
    w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
    w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
    // recompute w[10..19] as 3x3 sub-determinants of rows 0, 1 and 2
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];

    // fourth and fifth columns of the result
    b(0,3) = a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16];
    b(1,3) = -a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13];
    b(2,3) = a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11];
    b(3,3) = -a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10];
    b(4,3) = a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10];

    b(0,4) = -a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16];
    b(1,4) = a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13];
    b(2,4) = -a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11];
    b(3,4) = a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10];
    b(4,4) = -a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10];
  }
};
4378
4379 template<>
4380 struct CofactorHelper<6,6>
4381 {
// Specialization that computes the 6x6 cofactor matrix b of a via a staged
// Laplace expansion: 2x2 minors -> 3x3 minors -> 4x4 minors, then one column
// of b per final row-expansion. Column j of b is assembled from the 5x5
// minors of a that omit row j. NOTE(review): the resulting index layout is
// b(i,j) = +/- M(j,i), i.e. the transposed-cofactor (adjugate) arrangement --
// confirm this matches the cofactor convention documented on set_cofactor().
// Callable on host and device; uses only plain arithmetic, no branching.
4382 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4383 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
4384 {
// Scratch workspace: slots w[0..14] hold the fifteen 2x2 (later recycled for
// the fifteen 4x4) minors, slots w[15..34] hold the twenty 3x3 minors.
4385 T_ w[35];
// ---- columns 0 and 1 of b ----
// w[0..14]: all 2x2 minors of rows 4,5 (column pairs in lexicographic order)
4386 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4387 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4388 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4389 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4390 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4391 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4392 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4393 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4394 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4395 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4396 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4397 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4398 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4399 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4400 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
// w[15..34]: all 3x3 minors of rows 3,4,5, built from the 2x2 minors above
4401 w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
4402 w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
4403 w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
4404 w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
4405 w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
4406 w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
4407 w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
4408 w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
4409 w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
4410 w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
4411 w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
4412 w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
4413 w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
4414 w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
4415 w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
4416 w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
4417 w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
4418 w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
4419 w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
4420 w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
4421
// w[0..14] recycled: all 4x4 minors of rows 2,3,4,5 (2x2 minors no longer needed)
4422 w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
4423 w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
4424 w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
4425 w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
4426 w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
4427 w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
4428 w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
4429 w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
4430 w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
4431 w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
4432 w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
4433 w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
4434 w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
4435 w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
4436 w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
4437
// column 0 of b: expand along row 1 -> signed 5x5 minors omitting row 0
4438 b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
4439 b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
4440 b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
4441 b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
4442 b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
4443 b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
4444
// column 1 of b: expand along row 0 with opposite signs -> minors omitting row 1
4445 b(0,1) =-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10];
4446 b(1,1) = a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6];
4447 b(2,1) =-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3];
4448 b(3,1) = a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1];
4449 b(4,1) =-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0];
4450 b(5,1) = a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0];
4451
// ---- columns 2 and 3 of b ----
// w[0..14]: 2x2 minors of rows 4,5 again (previous values were overwritten above)
4452 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4453 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4454 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4455 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4456 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4457 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4458 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4459 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4460 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4461 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4462 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4463 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4464 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4465 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4466 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
// w[15..34]: 3x3 minors of rows 1,4,5
4467 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4468 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4469 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4470 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4471 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4472 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4473 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4474 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4475 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4476 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4477 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4478 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4479 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4480 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4481 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4482 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4483 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4484 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4485 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4486 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
// w[0..14] recycled: 4x4 minors of rows 0,1,4,5
4487 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4488 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4489 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4490 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4491 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4492 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4493 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4494 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4495 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4496 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4497 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4498 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4499 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4500 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4501 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4502
// column 2 of b: expand along row 3 -> minors omitting row 2
4503 b(0,2) = a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10];
4504 b(1,2) =-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6];
4505 b(2,2) = a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3];
4506 b(3,2) =-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1];
4507 b(4,2) = a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0];
4508 b(5,2) =-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0];
4509
// column 3 of b: expand along row 2 with opposite signs -> minors omitting row 3
4510 b(0,3) =-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10];
4511 b(1,3) = a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6];
4512 b(2,3) =-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3];
4513 b(3,3) = a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1];
4514 b(4,3) =-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0];
4515 b(5,3) = a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0];
4516
// ---- columns 4 and 5 of b ----
// w[0..14]: 2x2 minors of rows 2,3
4517 w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
4518 w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
4519 w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
4520 w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
4521 w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
4522 w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
4523 w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
4524 w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
4525 w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
4526 w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
4527 w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
4528 w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
4529 w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
4530 w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
4531 w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
// w[15..34]: 3x3 minors of rows 1,2,3
4532 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4533 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4534 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4535 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4536 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4537 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4538 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4539 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4540 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4541 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4542 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4543 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4544 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4545 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4546 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4547 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4548 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4549 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4550 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4551 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
4552
// w[0..14] recycled: 4x4 minors of rows 0,1,2,3
4553 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4554 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4555 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4556 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4557 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4558 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4559 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4560 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4561 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4562 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4563 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4564 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4565 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4566 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4567 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4568
// column 4 of b: expand along row 5 -> minors omitting row 4
4569 b(0,4) = a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10];
4570 b(1,4) =-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6];
4571 b(2,4) = a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3];
4572 b(3,4) =-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1];
4573 b(4,4) = a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0];
4574 b(5,4) =-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0];
4575
// column 5 of b: expand along row 4 with opposite signs -> minors omitting row 5
4576 b(0,5) =-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10];
4577 b(1,5) = a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6];
4578 b(2,5) =-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3];
4579 b(3,5) = a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1];
4580 b(4,5) =-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0];
4581 b(5,5) = a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0];
4582 }
4583 };
4584 } // namespace Intern
4586 } // namespace Tiny
4587} // namespace FEAT
#define XABORTM(msg)
Abort macro definition with custom message.
Definition: assertion.hpp:192
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
Definition: assertion.hpp:230
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263
FEAT Kernel base header.
Math Limits class template.
Definition: math.hpp:1475
Tiny Matrix class template.
CUDA_HOST_DEVICE DataType det() const
Returns the determinant of the matrix.
CUDA_HOST_DEVICE Matrix(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE Matrix & set_cofactor(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the cofactor matrix of another matrix.
CUDA_HOST_DEVICE Matrix & add_scalar_main_diag(DataType alpha)
Adds a value onto the matrix's main diagonal.
CUDA_HOST_DEVICE Matrix & add_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y, const DataType alpha=DataType(1))
Adds the outer product of two vectors onto the matrix.
CUDA_HOST_DEVICE Matrix & add_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b, DataType alpha=DataType(1))
Adds the algebraic matrix-product of two other matrices onto this matrix.
CUDA_HOST_DEVICE Matrix & set_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b)
Sets this matrix to the algebraic matrix-product of two other matrices.
CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
Returns the Frobenius norm squared of the matrix.
T_ ValueType
the data type of the matrix
CUDA_HOST_DEVICE Matrix & operator=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
assignment operator
CUDA_HOST_DEVICE Matrix(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE Matrix & add_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a vector-tensor left-product onto this matrix.
CUDA_HOST_DEVICE Matrix & set_transpose(const Matrix< T_, n_, m_, sma_, sna_ > &a)
Sets this matrix to the transpose of another matrix.
CUDA_HOST friend std::istream & operator>>(std::istream &in, Matrix &A)
Tiny::Matrix streaming operator.
CUDA_HOST_DEVICE Matrix & axpy(DataType alpha, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Adds another scaled matrix onto this matrix.
CUDA_HOST_DEVICE Matrix & operator-=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise subtraction operator
CUDA_HOST_DEVICE Matrix(const Matrix< T2_, m_, n_, sma_, sna_ > &a)
copy constructor
static CUDA_HOST_DEVICE Matrix null()
Returns a null-matrix.
CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
Returns the Frobenius norm of the difference of this matrix and the identity matrix.
CUDA_HOST_DEVICE Matrix & set_inverse(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the inverse of another matrix.
CUDA_HOST_DEVICE const RowType & operator[](int i) const
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator*=(DataType alpha)
scalar-right-multiply-by operator
CUDA_HOST_DEVICE DataType scalar_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y) const
Computes the scalar product of two vectors with this matrix.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
Vector< T_, n_, sn_ > RowType
the type of a single matrix row
CUDA_HOST_DEVICE const T_ & operator()(int i, int j) const
Access operator.
CUDA_HOST_DEVICE Matrix & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Matrix & set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
Sets this matrix to a 3D yaw-pitch-roll rotation matrix.
CUDA_HOST_DEVICE DataType trace() const
Returns the trace of the matrix.
CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
Returns the Hessian norm square.
CUDA_HOST_DEVICE Matrix(const std::initializer_list< Tx_ > &x)
Initializer list of Tiny::Vector constructor.
CUDA_HOST_DEVICE Matrix & set_gram(const Matrix< T_, l_, n_, sla_, sna_ > &a)
Sets this matrix to the Gram matrix of another matrix.
CUDA_HOST_DEVICE Matrix & set_rotation_2d(T_ angle)
Sets this matrix to a 2D rotation matrix.
CUDA_HOST_DEVICE Matrix & set_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, T_ alpha=T_(1))
Sets this matrix to the algebraic matrix double-product of three other matrices.
CUDA_HOST_DEVICE T_ & operator()(int i, int j)
Access operator.
CUDA_HOST_DEVICE Matrix & add_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the algebraic matrix double-product of three other matrices onto this matrix.
CUDA_HOST_DEVICE void convert(const Matrix< Tx_, m_, n_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Matrix & set_identity()
Sets this matrix to the identity matrix.
CUDA_HOST_DEVICE Matrix & operator+=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise addition operator
CUDA_HOST_DEVICE RowType & operator[](int i)
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix()
default constructor
CUDA_HOST_DEVICE void copy_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Copies the upper left mm_ x nn_ entries of a matrix.
CUDA_HOST_DEVICE DataType vol() const
Returns the volume of the matrix.
RowType v[sm_]
actual matrix data; that's an array of vectors
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix & set_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Sets this matrix to the result of a vector-tensor left-product.
CUDA_HOST_DEVICE Matrix & set_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y)
Sets this matrix to the outer product of two vectors.
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the matrix.
CUDA_HOST_DEVICE void copy(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Copies a matrix.
CUDA_HOST_DEVICE DataType norm_frobenius() const
Returns the Frobenius norm of the matrix.
Tiny Tensor3 class template.
CUDA_HOST_DEVICE Tensor3 & add_double_mat_mult(const Tensor3< T_, lt_, mt_, nt_, slt_, smt_, snt_ > &t, const Matrix< T_, nt_, n_, smb_, snb_ > &b, const Matrix< T_, mt_, m_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the result of a matrix-tensor-matrix double-product onto this tensor.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE void copy_n(const Tensor3< T_, la_, ma_, na_, sla_, sma_, sna_ > &a)
Copies the upper left ll_ x mm_ x nn_ entries of a tensor.
static constexpr int l
the tube count of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
static constexpr int sn
the column stride of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-assignment operator
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
formats the tensor
CUDA_HOST_DEVICE const T_ & operator()(int h, int i, int j) const
Access operator.
CUDA_HOST_DEVICE Tensor3 & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Tensor3 & add_vec_mat_outer_product(const Vector< T_, l_, slx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix outer product onto this tensor.
CUDA_HOST_DEVICE Tensor3 & operator+=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise addition operator
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE const PlaneType & operator[](int h) const
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-constructor
Matrix< T_, m_, n_, sm_, sn_ > PlaneType
Type of tensor data; that's an array of matrices.
static constexpr int sl
the tube stride of the tensor
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
CUDA_HOST_DEVICE PlaneType & operator[](int h)
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(DataType value)
value-assignment constructor
static CUDA_HOST_DEVICE Tensor3 null()
Returns a null-tensor.
CUDA_HOST_DEVICE Tensor3 & add_mat_tensor_mult(const Matrix< T_, l_, k_, sma_, sna_ > &a, const Tensor3< T_, k_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a matrix-tensor product onto this tensor.
PlaneType v[sl_]
Actual tensor data.
static constexpr int m
the row count of the tensor
CUDA_HOST_DEVICE void convert(const Tensor3< Tx_, l_, m_, n_, sla_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Tensor3()
default constructor
CUDA_HOST_DEVICE Tensor3 & operator-=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise subtraction operator
CUDA_HOST_DEVICE T_ & operator()(int h, int i, int j)
Access operator.
static constexpr int n
the column count of the tensor
static constexpr int sm
the row stride of the tensor
T_ ValueType
the data type of the tensor
CUDA_HOST_DEVICE Tensor3 & operator*=(DataType alpha)
scalar right-multiply-by operator
CUDA_HOST_DEVICE void copy(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
Copies a tensor3.
Tiny Vector class template.
static CUDA_HOST_DEVICE Vector null()
Returns a null-vector.
CUDA_HOST_DEVICE DataType norm_euclid_n() const
Computes the euclid norm of first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
Computes the squared euclid norm of first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & normalize_n()
Normalizes the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & operator=(const Vector< T_, n_, sx_ > &x)
copy-assignment operator
T_ v[s_]
actual vector data
CUDA_HOST_DEVICE bool normalized() const
Return if the vector is normalized.
CUDA_HOST_DEVICE void format_n(DataType alpha=DataType(0))
Formats the first nn_ entries of the vector.
static CUDA_HOST_DEVICE Vector convert_new(Vector &&x)
overload for moveable rvalue type
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the vector.
CUDA_HOST_DEVICE Vector(const Vector< T_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE DataType norm_max() const
Computes the max-norm of this vector.
CUDA_HOST_DEVICE T_ & operator()(int i)
Access operator.
CUDA_HOST_DEVICE const T_ & operator[](int i) const
Access operator.
CUDA_HOST_DEVICE Vector & operator-=(const Vector< T_, n_, sx_ > &x)
vector-subtract operator
CUDA_HOST_DEVICE Vector & set_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this vector to the result of a vector-matrix product.
static CUDA_HOST_DEVICE Vector convert_new(const Vector< Tx_, n_, sx_ > &x)
convert function, not callable with non convertable inner type
CUDA_HOST_DEVICE Vector & add_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE Vector(const Vector< Tx_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE void scale_n(DataType alpha)
Scales the first nn_ entries of the vector.
CUDA_HOST_DEVICE Vector & set_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x)
Sets the first nn_ entries of this vector to the result of a matrix-vector product with the first mm_...
static constexpr int n
the length of the vector
CUDA_HOST_DEVICE Vector & operator*=(DataType alpha)
scalar-multiply operator
CUDA_HOST_DEVICE Vector()
default constructor
CUDA_HOST_DEVICE DataType norm_l1() const
Computes the l1-norm of this vector.
CUDA_HOST_DEVICE Vector & set_convex(DataType alpha, const Vector< T_, n_, sna_ > &a, const Vector< T_, n_, snb_ > &b)
Sets this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE Vector & axpy_n(DataType alpha, const Vector< T_, nx_, snx_ > &x)
Adds the first nn_ entries of another scaled vector onto this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST friend std::ostream & operator<<(std::ostream &lhs, const Vector &b)
Tiny::Vector streaming operator.
CUDA_HOST_DEVICE Vector & operator+=(const Vector< T_, n_, sx_ > &x)
vector-add operator
CUDA_HOST_DEVICE Vector & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Vector & operator=(const Vector< Tx_, n_, sx_ > &x)
copy-assignment operator
CUDA_HOST_DEVICE const T_ & operator()(int i) const
Access operator.
CUDA_HOST_DEVICE Vector & axpy(DataType alpha, const Vector< T_, n_, snx_ > &x)
Adds another scaled vector onto this vector.
CUDA_HOST_DEVICE void copy(const Vector< T_, n_, snx_ > &x)
Copies a vector.
CUDA_HOST_DEVICE Vector(const std::array< Tx_, n_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & normalize()
Normalizes this vector.
CUDA_HOST_DEVICE Vector & operator=(const std::array< Tx_, n_ > &x)
copy-assignment operator
CUDA_HOST_DEVICE Vector & negate()
Negates the vector, i.e. effectively multiplies all components by -1.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE Vector(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE DataType norm_max_n() const
Computes the max-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & negate_n()
Negates the first nn_ entries of this vector.
CUDA_HOST_DEVICE DataType norm_l1_n() const
Computes the l1-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & set_convex_n(DataType alpha, const Vector< T_, na_, sna_ > &a, const Vector< T_, nb_, snb_ > &b)
Sets the first nn_ entries of this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
Computes the squared euclid norm of this vector.
CUDA_HOST_DEVICE void copy_n(const Vector< T_, nx_, snx_ > &x)
Copies the first nn_ entries of a vector.
CUDA_HOST_DEVICE void scale(DataType alpha)
Scales the vector.
CUDA_HOST_DEVICE Vector(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
static constexpr int s
the stride of the vector
CUDA_HOST_DEVICE Vector & set_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x)
Sets this vector to the result of a matrix-vector product.
CUDA_HOST_DEVICE Vector & add_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE T_ & operator[](int i)
Access operator.
T_ ValueType
the value type of the vector
CUDA_HOST_DEVICE DataType norm_euclid() const
Computes the euclid norm of this vector.
CUDA_HOST_DEVICE Vector & set_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Sets the first mm_ entries of this vector to the result of a vector-matrix product with the first mm_...
CUDA_HOST_DEVICE void convert(const Vector< Tx_, n_, sx_ > &x)
conversion operator
CUDA_HOST_DEVICE Vector & operator*=(const Vector< T_, n_, sx_ > &x)
element-wise-multiply operator
T_ sqrt(T_ x)
Returns the square-root of a value.
Definition: math.hpp:300
T_ abs(T_ x)
Returns the absolute value.
Definition: math.hpp:275
DT_ calc_opening_angle(DT_ x1, DT_ x2, DT_ y1, DT_ y2)
Calculates the opening angle of two 2D vectors.
Definition: math.hpp:1450
T_ pow(T_ x, T_ y)
Returns x raised to the power of y.
Definition: math.hpp:643
T_ sin(T_ x)
Returns the sine of a value.
Definition: math.hpp:344
T_ sqr(T_ x)
Returns the square of a value.
Definition: math.hpp:95
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123
T_ cos(T_ x)
Returns the cosine of a value.
Definition: math.hpp:386
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
CUDA_HOST_DEVICE Vector< T_, n_ > operator+(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector addition operator
CUDA_HOST Vector< T_, dim_ > project_onto(const Vector< T_, dim_ > &x, const Vector< T_, dim_ > &y)
Calculates the projected vector.
CUDA_HOST_DEVICE T_ dot(const T_ &a, const T_ &b)
Computes the dot-product of two scalars.
CUDA_HOST_DEVICE void orthogonal_3x2(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 3x2 matrix.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
CUDA_HOST_DEVICE void orthogonal_2x1(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 2x1 matrix.
CUDA_HOST_DEVICE void add_id(T_ &x, const T_ &alpha)
Adds a scaled identity onto a scalar.
Vector< T_, m_ > orthogonal(const Matrix< T_, m_, m_-1, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a m_ x (m_-1) Matrix.
CUDA_HOST_DEVICE Vector< T_, n_ > operator-(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector subtraction operator
CUDA_HOST_DEVICE Vector< T_, n_ > component_product(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector element-wise-product operator
CUDA_HOST_DEVICE Vector< T_, n_ > operator*(typename Vector< T_, n_ >::DataType alpha, const Vector< T_, n_, s_ > &x)
scalar-left-multiplication operator
CUDA_HOST T_ calculate_opening_angle(const Vector< T_, 2 > &x, const Vector< T_, 2 > &y)
Calculates the counter-clockwise opening angle between two 2D vectors.
FEAT namespace.
Definition: adjactor.hpp:12
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:993
@ value
specifies whether the space should supply basis function values