#include <kernel/util/math.hpp>
#include <initializer_list>
#include <kernel/util/cuda_math.cuh>
      // helper class that recursively extracts the innermost scalar data type of nested Vector/Matrix/Tensor3 types
      template<typename T_>
      struct DataTypeExtractor
      {
        typedef T_ MyDataType;
        static constexpr int level = 0;
      };

      template<typename T_, int n_, int s_>
      struct DataTypeExtractor<Vector<T_, n_, s_>>
      {
        typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
        static constexpr int level = DataTypeExtractor<T_>::level+1;
      };

      template<typename T_, int m_, int n_, int sm_, int sn_>
      struct DataTypeExtractor<Matrix<T_, m_, n_, sm_, sn_>>
      {
        typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
        static constexpr int level = DataTypeExtractor<T_>::level+1;
      };

      template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
      struct DataTypeExtractor<Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>>
      {
        typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
        static constexpr int level = DataTypeExtractor<T_>::level+1;
      };
 
      // forward declarations of the helper classes used by Matrix (determinant, volume, inverse, cofactor, grouped CUDA inverse)
      template<int m_, int n_>
      struct DetHelper;

      template<int m_, int n_>
      struct VolHelper;

      template<int m_, int n_>
      struct InverseHelper;

      template<int m_, int n_>
      struct CofactorHelper;

      template<int m_, int n_>
      struct CudaGroupedInverseHelper;
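      // Illustrative sketch (not part of the original header): in code that includes this header,
      // the recursive extractor resolves nested Tiny containers to their innermost scalar type
      // and counts the nesting depth, e.g.
      //
      //   static_assert(std::is_same<
      //     Intern::DataTypeExtractor<Matrix<Vector<double, 3>, 2, 2>>::MyDataType, double>::value, "");
      //   static_assert(Intern::DataTypeExtractor<Matrix<Vector<double, 3>, 2, 2>>::level == 2, "");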
 
    template<typename T_, int n_, int s_>
    class Vector
    {
    public:
      static_assert(n_ > 0, "invalid vector length");
      static_assert(s_ >= n_, "invalid vector stride");

      static constexpr int n = n_;
      static constexpr int s = s_;

      typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
 
  235        for(
int i(0); i < n_; ++i)
 
  245        for(
int i(0); i < n_; ++i)
 
  252      template<
typename Tx_, 
int sx_>
 
  256        for(
int i(0); i < n_; ++i)
 
  258          if constexpr(std::is_same<T_, DataType>::value)
 
  261            v[i] = T_::convert(x.v[i]);
 
      template<typename Tx_>
      CUDA_HOST_DEVICE explicit Vector(const std::initializer_list<Tx_>& x)
      {
        XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
        auto it(x.begin()); // assumed completion: iterator over the initializer list (the loop advances 'it')
        for(int i(0); i < n_; ++i, ++it)
          v[i] = T_(*it); // assumed completion of the elided loop body
      }
 
  296        for(
int i(0); i < n_; ++i)
 
  307        for(
int i(0); i < n_; ++i)
 
  328      template<
typename Tx_>
 
  331        XASSERTM(std::size_t(n_) == x.size(), 
"invalid initializer list size");
 
  333        for(
int i(0); i < n_; ++i, ++it)
 
  339      template<
typename Tx_, 
int sx_>
 
  342        for(
int i(0); i < n_; ++i)
 
  356        ASSERTM((i >= 0) && (i < n_), 
"index i out-of-bounds");
 
  363        ASSERTM((i >= 0) && (i < n_), 
"index i out-of-bounds");
 
  370        ASSERTM((i >= 0) && (i < n_), 
"index i out-of-bounds");
 
  377        ASSERTM((i >= 0) && (i < n_), 
"index i out-of-bounds");
 
  390        for(
int i(0); i < n_; ++i)
 
  405      template<
int nn_, 
int nx_, 
int snx_>
 
  408        static_assert(nn_ <= n_, 
"invalid copy_n size");
 
  409        static_assert(nn_ <= nx_, 
"invalid copy_n size");
 
  410        for(
int i(0); i < nn_; ++i)
 
  419        for(
int i(0); i < n_; ++i)
 
  430        for(
int i(0); i < n_; ++i)
 
  441        for(
int i(0); i < n_; ++i)
 
  452        for(
int i(0); i < n_; ++i)
 
  467        for(
int i(0); i < n_; ++i)
 
  485        static_assert(nn_ <= n_, 
"invalid format_n size");
 
  486        for(
int i(0); i < nn_; ++i)
 
  500        for(
int i(0); i < n_; ++i)
 
  518        static_assert(nn_ <= n_, 
"invalid scale_n size");
 
  519        for(
int i(0); i < nn_; ++i)
 
        ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
        return ((*this) *= (DataType(1)/norm2));

        ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
        return ((*this) *= CudaMath::cuda_rsqrt(norm2_sqr));

        static_assert(nn_ <= n_, "invalid normalize_n size");
        const DataType norm2(this->template norm_euclid_n<nn_>());
        ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
        this->template scale_n<nn_>(DataType(1)/norm2);

        const DataType norm2_sqr(this->template norm_euclid_sqr_n<nn_>());
        ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
        this->template scale_n<nn_>(CudaMath::cuda_rsqrt(norm2_sqr));
 
  575        for(
int i(0); i < n_; ++i)
 
  591        static_assert(nn_ <= n_, 
"invalid negate_n size");
 
  592        for(
int i(0); i < nn_; ++i)
 
  611        for(
int i(0); i < n_; ++i)
 
  612          v[i] += alpha * x.v[i];
 
  630      template<
int nn_, 
int nx_, 
int snx_>
 
  633        static_assert(nn_ <= n_, 
"invalid negate_n size");
 
  634        static_assert(nn_ <= nx_, 
"invalid negate_n size");
 
  635        for(
int i(0); i < nn_; ++i)
 
  636          v[i] += alpha * x.v[i];
 
  657      template<
int sna_, 
int snb_>
 
  660        for(
int i(0); i < n_; ++i)
 
  661          v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
 
  685      template<
int nn_, 
int na_, 
int nb_, 
int sna_, 
int snb_>
 
  688        static_assert(nn_ <= n_, 
"invalid set_convex_n size");
 
  689        static_assert(nn_ <= na_, 
"invalid set_convex_n size");
 
  690        static_assert(nn_ <= nb_, 
"invalid set_convex_n size");
 
  691        for(
int i(0); i < nn_; ++i)
 
  692          v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
 
  711      template<
int m_, 
int sma_, 
int sna_, 
int sx_>
 
  715        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  717        for(
int i(0); i < n_; ++i)
 
  720          for(
int j(0); j < m_; ++j)
 
  722            v[i] += a.
v[i][j] * x.
v[j];
 
  747      template<
int mm_, 
int nn_, 
int ma_, 
int na_, 
int sna_, 
int sma_, 
int nx_, 
int sx_>
 
  750        static_assert(mm_ <= n_, 
"invalid set_mat_vec_mult_n size");
 
  751        static_assert(mm_ <= ma_, 
"invalid set_mat_vec_mult_n size");
 
  752        static_assert(nn_ <= nx_, 
"invalid set_mat_vec_mult_n size");
 
  753        static_assert(nn_ <= na_, 
"invalid set_mat_vec_mult_n size");
 
  756        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  758        for(
int i(0); i < mm_; ++i)
 
  761          for(
int j(0); j < nn_; ++j)
 
  763            v[i] += a.
v[i][j] * x.
v[j];
 
  784      template<
int m_, 
int sma_, 
int sna_, 
int sx_>
 
  788        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  790        for(
int j(0); j < n_; ++j)
 
  793          for(
int i(0); i < m_; ++i)
 
  795            v[j] += a.
v[i][j] * x.
v[i];
 
  820      template<
int nn_, 
int mm_, 
int mx_, 
int smx_, 
int ma_, 
int na_, 
int sma_, 
int sna_>
 
  823        static_assert(mm_ <= mx_, 
"invalid set_mat_vec_mult_n size");
 
  824        static_assert(mm_ <= ma_, 
"invalid set_mat_vec_mult_n size");
 
  825        static_assert(nn_ <= n_, 
"invalid set_mat_vec_mult_n size");
 
  826        static_assert(nn_ <= na_, 
"invalid set_mat_vec_mult_n size");
 
  829        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  831        for(
int j(0); j < nn_; ++j)
 
  834          for(
int i(0); i < mm_; ++i)
 
  836            v[j] += a.
v[i][j] * x.
v[i];
 
  860      template<
int m_, 
int sma_, 
int sna_, 
int sx_>
 
  864        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  866        for(
int i(0); i < n_; ++i)
 
  868          for(
int j(0); j < m_; ++j)
 
  870            v[i] += alpha * a.
v[i][j] * x.
v[j];
 
  900      template<
int mm_, 
int nn_, 
int ma_, 
int na_, 
int sna_, 
int sma_, 
int nx_, 
int sx_>
 
  903        static_assert(mm_ <= n_, 
"invalid add_mat_vec_mult_n size");
 
  904        static_assert(mm_ <= ma_, 
"invalid add_mat_vec_mult_n size");
 
  905        static_assert(nn_ <= nx_, 
"invalid add_mat_vec_mult_n size");
 
  906        static_assert(nn_ <= na_, 
"invalid add_mat_vec_mult_n size");
 
  909        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  911        for(
int i(0); i < mm_; ++i)
 
  913          for(
int j(0); j < nn_; ++j)
 
  915            v[i] += alpha * a.
v[i][j] * x.
v[j];
 
  939      template<
int m_, 
int sma_, 
int sna_, 
int sx_>
 
  943        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  945        for(
int j(0); j < n_; ++j)
 
  947          for(
int i(0); i < m_; ++i)
 
  949            v[j] += alpha * a.
v[i][j] * x.
v[i];
 
  979      template<
int nn_, 
int mm_, 
int mx_, 
int smx_, 
int ma_, 
int na_, 
int sma_, 
int sna_>
 
  982        static_assert(mm_ <= mx_, 
"invalid add_vec_mat_mult_n size");
 
  983        static_assert(mm_ <= ma_, 
"invalid add_vec_mat_mult_n size");
 
  984        static_assert(nn_ <= n_, 
"invalid add_vec_mat_mult_n size");
 
  985        static_assert(nn_ <= na_, 
"invalid add_vec_mat_mult_n size");
 
  988        ASSERTM((
const void*)
this != (
const void*)&x, 
"result vector and multiplicand vector 'x' must be different objects");
 
  990        for(
int j(0); j < nn_; ++j)
 
  992          for(
int i(0); i < mm_; ++i)
 
  994            v[j] += alpha * a.
v[i][j] * x.
v[i];
 
 1009        for(
int i(0); i < n_; ++i)
 
 1014          r += CudaMath::cuda_sqr(
v[i]);
 
 1029        static_assert(nn_ <= n_, 
"invalid norm_euclid_sqr_n size");
 
 1031        for(
int i(0); i < nn_; ++i)
 
 1036          r += CudaMath::cuda_sqr(
v[i]);
 
 1066        static_assert(nn_ <= n_, 
"invalid norm_euclid_n size");
 
 1068        return Math::sqrt(this->
template norm_euclid_sqr_n<nn_>());
 
 1070        return CudaMath::cuda_sqrt(this->
template norm_euclid_sqr_n<nn_>());
 
 1083        for(
int i(0); i < n_; ++i)
 
 1088          r += CudaMath::cuda_abs(
v[i]);
 
 1103        static_assert(nn_ <= n_, 
"invalid norm_l1_n size");
 
 1105        for(
int i(0); i < nn_; ++i)
 
 1110          r += CudaMath::cuda_abs(
v[i]);
 
 1125        for(
int i(0); i < n_; ++i)
 
 1130          r = CudaMath::cuda_max(r, CudaMath::cuda_abs(
v[i]));
 
 1145        static_assert(nn_ <= n_, 
"invalid norm_l1_n size");
 
 1147        for(
int i(0); i < nn_; ++i)
 
 1152          r = CudaMath::cuda_max(r, CudaMath::cuda_abs(
v[i]));
 
 1183        for (
int i(0) ; i < b.n ; ++i)
 
    template<typename T_, int sx_, int sa_>
    inline void cross(Vector<T_, 2, sx_>& x, const Vector<T_, 2, sa_>& a)

    template<typename T_, int sx_, int sa_, int sb_>
    inline void cross(Vector<T_, 3, sx_>& x, const Vector<T_, 3, sa_>& a, const Vector<T_, 3, sb_>& b)
    {
      // 3D cross product: x := a x b
      x.v[0] = a.v[1]*b.v[2] - a.v[2]*b.v[1];
      x.v[1] = a.v[2]*b.v[0] - a.v[0]*b.v[2];
      x.v[2] = a.v[0]*b.v[1] - a.v[1]*b.v[0];
    }
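    // Illustrative usage (hypothetical user code, not part of the header):
    //
    //   Tiny::Vector<double, 3> e1{1.0, 0.0, 0.0}, e2{0.0, 1.0, 0.0}, nrm;
    //   Tiny::cross(nrm, e1, e2); // nrm == (0, 0, 1), the right-handed normal of e1 and e2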
 
 1209    template<
typename T_, 
int n_, 
int s_>
 
 1216    template<
typename T_, 
int n_, 
int s_>
 
 1223    template<
typename T_, 
int n_, 
int sa_, 
int sb_>
 
 1230    template<
typename T_, 
int n_, 
int sa_, 
int sb_>
 
 1237    template<
typename T_, 
int n_, 
int sa_, 
int sb_>
 
 1249    template<
typename T_>
 
 1253      XABORTM(
"calculate_opening_angle not implemented for CUDA");
 
 1268    template<
typename T_, 
int dim_>
 
 1271      T_ norm2(y.template norm_euclid_n<dim_>());
 
 1272      if(norm2 < Math::eps<T_>())
 
 1274      const auto tmp_normalized = (T_(1)/norm2) * y;
 
 1275      return dot(x, tmp_normalized) * tmp_normalized;
 
    template<typename T_, int m_, int n_, int sm_, int sn_>
    class Matrix
    {
    public:
      static_assert(m_ > 0, "invalid row count");
      static_assert(n_ > 0, "invalid column count");
      static_assert(sm_ >= m_, "invalid row stride");
      static_assert(sn_ >= n_, "invalid column stride");

      static constexpr int m = m_;
      static constexpr int n = n_;
      static constexpr int sm = sm_;
      static constexpr int sn = sn_;

      typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
 
 1321        for(
int i(0); i < m_; ++i)
 
 1328      template<
typename T2_, 
int sma_, 
int sna_>
 
 1331        for(
int i(0); i < m_; ++i)
 
 1333          for(
int j(0); j < n_; ++j)
 
 1335            v[i][j] = T_(a.
v[i][j]);
 
 1351      template<
typename Tx_>
 
 1352      CUDA_HOST_DEVICE 
explicit Matrix(
const std::initializer_list<Tx_>& x)
 
 1354        XASSERTM(std::size_t(m_) == x.size(), 
"invalid initializer list size");
 
 1356        for(
int i(0); i < m_; ++i, ++it)
 
 1373      template<
typename Tx_>
 
 1374      CUDA_HOST_DEVICE 
explicit Matrix(
const std::initializer_list<std::initializer_list<Tx_>>& x)
 
 1376        XASSERTM(std::size_t(m_) == x.size(), 
"invalid initializer list size");
 
 1378        for(
int i(0); i < m_; ++i, ++it)
 
 1385        for(
int i(0); i < m_; ++i)
 
 1393      template<
int sma_, 
int sna_>
 
 1396        for(
int i(0); i < m_; ++i)
 
 1415      template<
typename Tx_>
 
 1418        XASSERTM(std::size_t(m_) == x.size(), 
"invalid initializer list size");
 
 1420        for(
int i(0); i < m_; ++i, ++it)
 
 1436      template<
typename Tx_>
 
 1437      CUDA_HOST_DEVICE 
Matrix& 
operator=(
const std::initializer_list<std::initializer_list<Tx_>>& x)
 
 1439        XASSERTM(std::size_t(m_) == x.size(), 
"invalid initializer list size");
 
 1441        for(
int i(0); i < m_; ++i, ++it)
 
 1447      template<
typename Tx_, 
int sma_, 
int sna_>
 
 1450        for(
int i(0); i < m_; ++i)
 
 1451          v[i].convert(a.
v[i]);
 
 1465        ASSERTM( (i >= 0) && (i < m_), 
"index i out-of-bounds");
 
 1466        ASSERTM( (j >= 0) && (j < n_), 
"index j out-of-bounds");
 
 1473        ASSERTM( (i >= 0) && (i < m_), 
"index i out-of-bounds");
 
 1474        ASSERTM( (j >= 0) && (j < n_), 
"index j out-of-bounds");
 
 1489        ASSERTM( (i >= 0) && (i <m_), 
"index i out-of-bounds");
 
 1496        ASSERTM( (i >= 0) && (i <m_), 
"index i out-of-bounds");
 
 1503        for(
int i(0); i < m_; ++i)
 
 1511      template<
int sma_, 
int sna_>
 
 1514        for(
int i(0); i < m_; ++i)
 
 1522      template<
int sma_, 
int sna_>
 
 1525        for(
int i(0); i < m_; ++i)
 
 1538      template<
int sma_, 
int sna_>
 
 1541        for(
int i(0); i < m_; ++i)
 
 1542          for(
int j(0); j < n_; ++j)
 
 1543            v[i][j] = a.
v[i][j];
 
 1558      template<
int mm_, 
int nn_, 
int ma_, 
int na_, 
int sma_, 
int sna_>
 
 1561        static_assert(mm_ <= m_, 
"invalid copy_n size");
 
 1562        static_assert(mm_ <= ma_, 
"invalid copy_n size");
 
 1563        static_assert(nn_ <= n_, 
"invalid copy_n size");
 
 1564        static_assert(nn_ <= na_, 
"invalid copy_n size");
 
 1565        for(
int i(0); i < mm_; ++i)
 
 1566          for(
int j(0); j < nn_; ++j)
 
 1567            v[i][j] = a.
v[i][j];
 
 1578        for(
int i(0); i < m_; ++i)
 
 1598        for(
int i(0); i < m_; ++i)
 
 1603          r += CudaMath::cuda_sqr(v[i][i]);
 
 1605          for(
int j(0); j < n_; ++j)
 
 1610            r += CudaMath::cuda_sqr(v[i][j]);
 
 1630        return CudaMath::cuda_sqrt(norm_frobenius_sqr());
 
 1645        for(
int i(0); i < m_; ++i)
 
 1647          for(
int j(0); j < n_; ++j)
 
 1652            r += CudaMath::cuda_sqr(v[i][j]);
 
 1667        for(
int i(0); i < m_; ++i)
 
 1669          for(
int j(0); j < n_; ++j)
 
 1674            r += CudaMath::cuda_sqr(v[i][j] - 
DataType(i == j ? 1 : 0));
 
 1681        return CudaMath::cuda_sqrt(r);
 
 1699        int k = CudaMath::cuda_min(m_, n_);
 
 1702        for(
int i(0); i < k; ++i)
 
 1718        return Intern::DetHelper<m_, n_>::compute(*
this);
 
 1734        return Intern::VolHelper<m_, n_>::compute(*
this);
 
      template<int sma_, int sna_>
        ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
        Intern::InverseHelper<m_, n_>::compute(*this, a);

      template<typename ThreadGroup_, int sma_, int sna_>
        ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
        Intern::CudaGroupedInverseHelper<m_, n_>::compute(tg, *this, a, det);

      template<int sma_, int sna_>
        ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
        Intern::CofactorHelper<m_, n_>::compute(*this, a);
 
      template<int sma_, int sna_>
        ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
        for(int i(0); i < m_; ++i)
          for(int j(0); j < n_; ++j)
            v[i][j] = a.v[j][i];

      template<int l_, int sla_, int sna_>
        static_assert(m_ == n_, "Gram matrices must be square");
        for(int k(0); k < l_; ++k)
          for(int i(0); i < n_; ++i)
            for(int j(0); j < n_; ++j)
              v[i][j] += a.v[k][i] * a.v[k][j];
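        // Note: the loop above accumulates the Gram matrix of the l_ x n_ input 'a',
        //   v(i,j) += sum_k a(k,i) * a(k,j),  i.e.  v += a^T * a,
        // which is symmetric (and positive semi-definite) by construction.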
 
 1880      template<
int snx_, 
int sny_>
 
 1884        for(
int i(0); i < m_; ++i)
 
 1886          r += x[i] * 
dot(v[i], y);
 
 1908      template<
int snx_, 
int sny_>
 
 1914        for(
int i(0); i < m_; ++i)
 
 1916          for(
int j(0); j < n_; ++j)
 
 1918            v[i][j] += alpha * x[i] * y[j];
 
 1938      template<
int snx_, 
int sny_>
 
 1943        for(
int i(0); i < m_; ++i)
 
 1945          for(
int j(0); j < n_; ++j)
 
 1947            v[i][j] = x[i] * y[j];
 
 1964      template<
int sma_, 
int sna_>
 
 1967        for(
int i(0); i < m_; ++i)
 
 1969          for(
int j(0); j < n_; ++j)
 
 1971            v[i][j] += alpha * a.
v[i][j];
 
 1985        for(
int i(0); (i < m_) && (i < n_); ++i)
 
 2008      template<
int la_, 
int lb_, 
int sma_, 
int sna_, 
int smb_, 
int snb_>
 
 2015        ASSERTM((
const void*)
this != (
const void*)&a, 
"result matrix and multiplicand matrix 'a' must be different objects");
 
 2016        ASSERTM((
const void*)
this != (
const void*)&b, 
"result matrix and multiplicand matrix 'b' must be different objects");
 
 2017        ASSERTM(la_ == lb_, 
"second dimension of a must be equal to first dimension of b");
 
 2019        for(
int i(0); i < m_; ++i)
 
 2021          for(
int j(0); j < n_; ++j)
 
 2024            for(
int k(0); k < la_; ++k)
 
 2026              r += a.v[i][k] * b.v[k][j];
 
 2028            v[i][j] += alpha * r;
 
 2049      template<
int la_, 
int lb_, 
int sma_, 
int sna_, 
int smb_, 
int snb_>
 
 2053        ASSERTM((
const void*)
this != (
const void*)&a, 
"result matrix and multiplicand matrix 'a' must be different objects");
 
 2054        ASSERTM((
const void*)
this != (
const void*)&b, 
"result matrix and multiplicand matrix 'b' must be different objects");
 
 2055        ASSERTM(la_ == lb_, 
"second dimension of a must be equal to first dimension of b");
 
 2058        return add_mat_mat_mult(a, b);
 
 2082      template<
int k_, 
int l_, 
int sma_, 
int sna_, 
int smb_, 
int snb_, 
int smd_, 
int snd_>
 
 2090        ASSERTM((
const void*)
this != (
const void*)&a, 
"result matrix and multiplicand matrix 'a' must be different objects");
 
 2091        ASSERTM((
const void*)
this != (
const void*)&b, 
"result matrix and multiplicand matrix 'b' must be different objects");
 
 2092        ASSERTM((
const void*)
this != (
const void*)&d, 
"result matrix and multiplicand matrix 'd' must be different objects");
 
 2094        for(
int i(0); i < m_; ++i)
 
 2096          for(
int j(0); j < n_; ++j)
 
 2099            for(
int p(0); p < k_; ++p)
 
 2102              for(
int q(0); q < l_; ++q)
 
 2104                t += a(p,q) * d(q,j);
 
 2108            v[i][j] += alpha * r;
 
 2135      template<
int k_, 
int l_, 
int sma_, 
int sna_, 
int smb_, 
int snb_, 
int smd_, 
int snd_>
 
 2143        ASSERTM((
const void*)
this != (
const void*)&a, 
"result matrix and multiplicand matrix 'a' must be different objects");
 
 2144        ASSERTM((
const void*)
this != (
const void*)&b, 
"result matrix and multiplicand matrix 'b' must be different objects");
 
 2145        ASSERTM((
const void*)
this != (
const void*)&d, 
"result matrix and multiplicand matrix 'd' must be different objects");
 
 2148        return add_double_mat_mult(a, b, d, alpha);
 
 2171      template<
int l_, 
int snv_, 
int slt_, 
int smt_, 
int snt_>
 
 2177        for(
int i(0); i < m_; ++i)
 
 2179          for(
int j(0); j < n_; ++j)
 
 2182            for(
int k(0); k < l_; ++k)
 
 2184              r += x(k) * t(k,i,j);
 
 2186            v[i][j] += alpha * r;
 
 2212      template<
int l_, 
int snv_, 
int slt_, 
int smt_, 
int snt_>
 
 2219        return add_vec_tensor_mult(x, t, alpha);
 
 2229        for(
int i(0); i < m_; ++i)
 
 2230          for(
int j(0); j < n_; ++j)
 
 2231            v[i][j] = (i == j ? T_(1) : T_(0));
 
        static_assert((m_ == 2) && (n_ == 2), "this function works only for 2x2 matrices");
        v[0][1] = -(v[1][0] = Math::sin(angle));
        v[0][0] =  (v[1][1] = CudaMath::cuda_cos(angle));
        v[0][1] = -(v[1][0] = CudaMath::cuda_sin(angle));

        static_assert((m_ == 3) && (n_ == 3), "this function works only for 3x3 matrices");
        const T_ cy = CudaMath::cuda_cos(yaw);
        const T_ sy = CudaMath::cuda_sin(yaw);
        const T_ cp = CudaMath::cuda_cos(pitch);
        const T_ sp = CudaMath::cuda_sin(pitch);
        const T_ cr = CudaMath::cuda_cos(roll);
        const T_ sr = CudaMath::cuda_sin(roll);
        v[0][1] = cy*sp*sr - sy*cr;
        v[0][2] = cy*sp*cr + sy*sr;
        v[1][1] = sy*sp*sr + cy*cr;
        v[1][2] = sy*sp*cr - cy*sr;
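        // Reference sketch (illustrative, assuming the standard convention): the entries above are
        // consistent with the yaw-pitch-roll (intrinsic Z-Y-X / Tait-Bryan) composition
        // R = R_z(yaw) * R_y(pitch) * R_x(roll), which, with cy = cos(yaw), sy = sin(yaw) etc., reads
        //
        //       [ cy*cp   cy*sp*sr - sy*cr   cy*sp*cr + sy*sr ]
        //   R = [ sy*cp   sy*sp*sr + cy*cr   sy*sp*cr - cy*sr ]
        //       [ -sp     cp*sr              cp*cr            ]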
 
 2314      CUDA_HOST 
friend std::ostream & operator<< (std::ostream & lhs, 
const Matrix& A)
 
 2316        for (
int i(0) ; i < m-1 ; ++i)
 
 2318          lhs << A[i] << 
"\n";
 
    template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
      static_assert(mx_ >= 2, "invalid nu vector size for orthogonal_2x1");
      static_assert(ma_ >= 2, "invalid matrix row size for orthogonal_2x1");
      static_assert(na_ >= 1, "invalid matrix column size for orthogonal_2x1");

    template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
      static_assert(mx_ >= 3, "invalid nu vector size for orthogonal_3x2");
      static_assert(ma_ >= 3, "invalid matrix row size for orthogonal_3x2");
      static_assert(na_ >= 2, "invalid matrix column size for orthogonal_3x2");
      // nu is the cross product of the two columns of tau, i.e. a vector orthogonal to both
      nu[0] = tau[1][0]*tau[2][1] - tau[2][0]*tau[1][1];
      nu[1] = tau[2][0]*tau[0][1] - tau[0][0]*tau[2][1];
      nu[2] = tau[0][0]*tau[1][1] - tau[1][0]*tau[0][1];
 
 2393    template<
typename T_, 
int m_, 
int sm_, 
int sn_>
 
 2398    template<
typename T_, 
int sm_, 
int sn_>
 
 2406    template<
typename T_, 
int sm_, 
int sn_>
 
 2407    CUDA_HOST_DEVICE Vector<T_, 3> 
orthogonal(
const Matrix<T_, 3, 2, sm_, sn_>& tau)
 
 2409      Vector<T_, 3, sm_> nu(T_(0));
 
 2416    template<
typename T_, 
int m_, 
int n_, 
int sm_, 
int sn_, 
int sx_>
 
 2423    template<
typename T_, 
int m_, 
int n_, 
int sm_, 
int sn_, 
int sx_>
 
 2430    template<
typename T_, 
int m_, 
int n_, 
int sm_, 
int sn_>
 
 2437    template<
typename T_, 
int m_, 
int n_, 
int sm_, 
int sn_>
 
 2438    CUDA_HOST_DEVICE 
inline Matrix<T_, m_, n_, sm_, sn_> operator*(
const Matrix<T_, m_, n_, sm_, sn_>& a, 
typename Matrix<T_, m_, n_>::DataType alpha)
 
 2444    template<
typename T_, 
int m_, 
int n_, 
int l_, 
int sma_, 
int sna_, 
int smb_, 
int snb_>
 
 2445    CUDA_HOST_DEVICE 
inline Matrix<T_, m_, n_> operator*(
const Matrix<T_, m_, l_, sma_, sna_>& a, 
const Matrix<T_, l_, n_, smb_, snb_>& b)
 
 2451    template<
typename T_, 
int m_, 
int n_,
int sma_, 
int sna_, 
int smb_, 
int snb_>
 
 2452    CUDA_HOST_DEVICE 
inline Matrix<T_, m_, n_> operator+(
const Matrix<T_, m_, n_, sma_, sna_>& a, 
const Matrix<T_, m_, n_, smb_, snb_>& b)
 
 2458    template<
typename T_, 
int m_, 
int n_,
int sma_, 
int sna_, 
int smb_, 
int snb_>
 
 2459    CUDA_HOST_DEVICE 
inline Matrix<T_, m_, n_> operator-(
const Matrix<T_, m_, n_, sma_, sna_>& a, 
const Matrix<T_, m_, n_, smb_, snb_>& b)
 
    template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
    class Tensor3
    {
    public:
      static_assert(l_ > 0, "invalid tube count");
      static_assert(m_ > 0, "invalid row count");
      static_assert(n_ > 0, "invalid column count");
      static_assert(sl_ >= l_, "invalid tube stride");
      static_assert(sm_ >= m_, "invalid row stride");
      static_assert(sn_ >= n_, "invalid column stride");

      static constexpr int l = l_;
      static constexpr int m = m_;
      static constexpr int n = n_;
      static constexpr int sl = sl_;
      static constexpr int sm = sm_;
      static constexpr int sn = sn_;

      typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
 
 2512        for(
int i(0); i < l_; ++i)
 
 2522      template<
typename Tx_>
 
 2523      CUDA_HOST_DEVICE 
explicit Tensor3(
const std::initializer_list<Tx_>& x)
 
 2525        XASSERTM(std::size_t(l_) == x.size(), 
"invalid initializer list size");
 
 2527        for(
int i(0); i < l_; ++i, ++it)
 
 2537      template<
typename Tx_>
 
 2538      CUDA_HOST_DEVICE 
explicit Tensor3(
const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
 
 2540        XASSERTM(std::size_t(l_) == x.size(), 
"invalid initializer list size");
 
 2542        for(
int i(0); i < l_; ++i, ++it)
 
 2547      template<
int sla_, 
int sma_, 
int sna_>
 
 2550        for(
int i(0); i < l_; ++i)
 
 2557        for(
int i(0); i < l_; ++i)
 
 2563      template<
int sla_, 
int sma_, 
int sna_>
 
 2566        for(
int i(0); i < l_; ++i)
 
 2579      template<
typename Tx_>
 
 2582        XASSERTM(std::size_t(l_) == x.size(), 
"invalid initializer list size");
 
 2584        for(
int i(0); i < l_; ++i, ++it)
 
 2597      template<
typename Tx_>
 
 2598      CUDA_HOST_DEVICE 
Tensor3& 
operator=(
const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
 
 2600        XASSERTM(std::size_t(l_) == x.size(), 
"invalid initializer list size");
 
 2602        for(
int i(0); i < l_; ++i, ++it)
 
 2608      template<
typename Tx_, 
int sla_, 
int sma_, 
int sna_>
 
 2611        for(
int i(0); i < l_; ++i)
 
 2626        ASSERTM( (h >= 0) && (h < l_), 
"index h out-of-bounds");
 
 2627        ASSERTM( (i >= 0) && (i < m_), 
"index i out-of-bounds");
 
 2628        ASSERTM( (j >= 0) && (j < n_), 
"index j out-of-bounds");
 
 2635        ASSERTM( (h >= 0) && (h < l_), 
"index h out-of-bounds");
 
 2636        ASSERTM( (i >= 0) && (i < m_), 
"index i out-of-bounds");
 
 2637        ASSERTM( (j >= 0) && (j < n_), 
"index j out-of-bounds");
 
 2652        ASSERTM( (h >= 0) && (h < l_), 
"index h out-of-bounds");
 
 2659        ASSERTM( (h >= 0) && (h < l_), 
"index h out-of-bounds");
 
 2666        for(
int i(0); i < l_; ++i)
 
 2672      template<
int sla_, 
int sma_, 
int sna_>
 
 2675        for(
int i(0); i < l_; ++i)
 
 2681      template<
int sla_, 
int sma_, 
int sna_>
 
 2684        for(
int i(0); i < l_; ++i)
 
 2695      template<
int sla_, 
int sma_, 
int sna_>
 
 2698        for(
int i(0); i < l_; ++i)
 
 2699          for(
int j(0); j < m_; ++j)
 
 2700            for(
int k(0); k < n_; ++k)
 
 2701              v[i][j][k] = a.
v[i][j][k];
 
 2716      template<
int ll_,
int mm_, 
int nn_, 
int la_, 
int ma_, 
int na_, 
int sla_, 
int sma_, 
int sna_>
 
 2719        static_assert(ll_ <= l_, 
"invalid copy_n size");
 
 2720        static_assert(ll_ <= la_, 
"invalid copy_n size");
 
 2721        static_assert(mm_ <= m_, 
"invalid copy_n size");
 
 2722        static_assert(mm_ <= ma_, 
"invalid copy_n size");
 
 2723        static_assert(nn_ <= n_, 
"invalid copy_n size");
 
 2724        static_assert(nn_ <= na_, 
"invalid copy_n size");
 
 2725        for(
int i(0); i < ll_; ++i)
 
 2726          for(
int j(0); j < mm_; ++j)
 
 2727            for(
int k(0); k < nn_; ++k)
 
 2728              v[i][j][k] = a.
v[i][j][k];
 
 2757      template<
int k_, 
int sma_, 
int sna_, 
int slt_, 
int smt_, 
int snt_>
 
 2764        ASSERTM((
const void*)
this != (
const void*)&t, 
"result tensor and multiplicand tensor 't' must be different objects");
 
 2766        for(
int h(0); h < l_; ++h)
 
 2768          for(
int i(0); i < m_; ++i)
 
 2770            for(
int j(0); j < n_; ++j)
 
 2773              for(
int p(0); p < k_; ++p)
 
 2775                r += a(h,p) * t(p,i,j);
 
 2809        int lt_, 
int mt_, 
int nt_, 
 
 2810        int slt_, 
int smt_, 
int snt_, 
 
 2811        int smb_, 
int snb_, 
int smd_, 
int snd_> 
 
 2819        ASSERTM((
const void*)
this != (
const void*)&t, 
"result tensor and multiplicand tensor 't' must be different objects");
 
 2821        for(
int h(0); h < l_; ++h)
 
 2823          for(
int i(0); i < m_; ++i)
 
 2825            for(
int j(0); j < n_; ++j)
 
 2828              for(
int p(0); p < mt_; ++p)
 
 2830                for(
int q(0); q < nt_; ++q)
 
 2832                  r += t(h,p,q) * b(p,i) * d(q,j);
 
 2862      template<
int slx_, 
int sma_, 
int sna_>
 
 2868        for(
int h(0); h < l_; ++h)
 
 2870          for(
int i(0); i < m_; ++i)
 
 2872            for(
int j(0); j < n_; ++j)
 
 2891    template<
typename T_, 
int l_, 
int m_, 
int n_, 
int sl_, 
int sm_, 
int sn_>
 
 2893      typename Tensor3<T_, l_, m_, n_>::DataType alpha, 
const Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& a)
 
 2899    template<
typename T_, 
int l_, 
int m_, 
int n_, 
int sl_, 
int sm_, 
int sn_>
 
 2901      const Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& a, 
typename Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>::DataType alpha)
 
    // dot product overloads; the scalar version only participates for non-nested types (level 0)
    template<typename T_, typename std::enable_if<Intern::DataTypeExtractor<T_>::level == 0, bool>::type = true>
    CUDA_HOST_DEVICE inline T_ dot(const T_& a, const T_& b)
    {
      return a * b; // assumed completion of the elided body
    }

    template<typename T_, int n_, int sa_, int sb_>
      for(int i(0); i < n_; ++i)

    template<typename T_, int m_, int n_, int sma_, int sna_, int smb_, int snb_>
    CUDA_HOST_DEVICE inline typename Matrix<T_, m_, n_>::DataType dot(const Matrix<T_, m_, n_, sma_, sna_>& a, const Matrix<T_, m_, n_, smb_, snb_>& b)

      for(int i(0); i < m_; ++i)
 
 2981    template<
typename T_, 
int l_, 
int m_, 
int n_, 
int sla_ ,
int sma_, 
int sna_, 
int slb_, 
int smb_, 
int snb_>
 
 2987      for(
int i(0); i < l_; ++i)
 
 3003    template<
typename T_>
 
 3004    CUDA_HOST_DEVICE 
inline void add_id(T_& x, 
const T_& alpha)
 
 3018    template<
typename T_, 
int n_, 
int sn_>
 
 3021      for(
int i(0); i < n_; ++i)
 
 3034    template<
typename T_, 
int n_, 
int sm_, 
int sn_>
 
 3037      for(
int i(0); i < n_; ++i)
 
 3050    template<
typename T_, 
int n_, 
int sl_, 
int sm_, 
int sn_>
 
 3051    CUDA_HOST_DEVICE 
inline void add_id(
Tensor3<T_, n_, n_, n_, sl_, sm_, sn_>& x, 
const typename Tensor3<T_, n_, n_, n_, sl_, sm_, sn_>::DataType& alpha)
 
 3053      for(
int i(0); i < n_; ++i)
 
 3071    template<
typename T_>
 
 3072    CUDA_HOST_DEVICE 
inline void axpy(T_& y, 
const T_& x, 
const T_& alpha)
 
 3091    template<
typename T_, 
int n_, 
int sn_>
 
 3097      for(
int i(0); i < n_; ++i)
 
 3098        axpy(y.v[i], x.v[i], alpha);
 
 3115    template<
typename T_, 
int m_, 
int n_, 
int sm_, 
int sn_>
 
 3121      for(
int i(0); i < m_; ++i)
 
 3122        axpy(y.
v[i], x.
v[i], alpha);
 
 3139    template<
typename T_, 
int l_, 
int m_, 
int n_, 
int sl_, 
int sm_, 
int sn_>
 
 3145      for(
int i(0); i < l_; ++i)
 
 3146        axpy(y.v[i], x.v[i], alpha);
 
      // generic square case: invert 'a' into a scratch matrix and return the determinant
      // reported by the inverse helper; non-normal results are mapped to zero
      template<int n_>
      struct DetHelper<n_,n_>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
        {
          Tiny::Matrix<T_, n_, n_> b;
          const T_ det = Intern::InverseHelper<n_, n_>::compute(b, a);
#ifndef __CUDACC__ // note: guard name is an assumption; the excerpt lists a host and a CUDA variant
          return Math::isnormal(det) ? det : T_(0);
#else
          return CudaMath::cuda_isnormal(det) ? det : T_(0);
#endif
        }
      };

      template<>
      struct DetHelper<1,1>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
        {
          return a(0,0);
        }
      };

      template<>
      struct DetHelper<2,2>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
        {
          return a(0,0)*a(1,1) - a(0,1)*a(1,0);
        }
      };

      template<>
      struct DetHelper<3,3>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
        {
          // Laplace expansion along the first row
          return  a(0,0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1))
            + a(0,1)*(a(1,2)*a(2,0) - a(1,0)*a(2,2))
            + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
        }
      };
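      // Illustrative check (hypothetical user code, not part of the header):
      //
      //   Tiny::Matrix<double, 2, 2> a{{3.0, 1.0}, {2.0, 4.0}};
      //   double d = Intern::DetHelper<2, 2>::compute(a); // 3*4 - 1*2 == 10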
 
      template<>
      struct DetHelper<4,4>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
        {
          // 2x2 minors of the last two rows
          T_ w[6] =
          {
            a(2,0)*a(3,1) - a(2,1)*a(3,0),
            a(2,0)*a(3,2) - a(2,2)*a(3,0),
            a(2,0)*a(3,3) - a(2,3)*a(3,0),
            a(2,1)*a(3,2) - a(2,2)*a(3,1),
            a(2,1)*a(3,3) - a(2,3)*a(3,1),
            a(2,2)*a(3,3) - a(2,3)*a(3,2)
          };
          // Laplace expansion along the first row using 3x3 cofactors built from w
          return
            + a(0,0) * (a(1,1)*w[5] - a(1,2)*w[4] + a(1,3)*w[3])
            - a(0,1) * (a(1,0)*w[5] - a(1,2)*w[2] + a(1,3)*w[1])
            + a(0,2) * (a(1,0)*w[4] - a(1,1)*w[2] + a(1,3)*w[0])
            - a(0,3) * (a(1,0)*w[3] - a(1,1)*w[1] + a(1,2)*w[0]);
        }
      };
 
 3237      struct DetHelper<5,5>
 
 3239        template<
typename T_, 
int sma_, 
int sna_>
 
 3240        CUDA_HOST_DEVICE 
static T_ compute(
const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
 
 3245            a(3,0)*a(4,1) - a(3,1)*a(4,0),
 
 3246            a(3,0)*a(4,2) - a(3,2)*a(4,0),
 
 3247            a(3,0)*a(4,3) - a(3,3)*a(4,0),
 
 3248            a(3,0)*a(4,4) - a(3,4)*a(4,0),
 
 3249            a(3,1)*a(4,2) - a(3,2)*a(4,1),
 
 3250            a(3,1)*a(4,3) - a(3,3)*a(4,1),
 
 3251            a(3,1)*a(4,4) - a(3,4)*a(4,1),
 
 3252            a(3,2)*a(4,3) - a(3,3)*a(4,2),
 
 3253            a(3,2)*a(4,4) - a(3,4)*a(4,2),
 
 3254            a(3,3)*a(4,4) - a(3,4)*a(4,3)
 
 3259            a(2,0)*v[4] - a(2,1)*v[1] + a(2,2)*v[0],
 
 3260            a(2,0)*v[5] - a(2,1)*v[2] + a(2,3)*v[0],
 
 3261            a(2,0)*v[6] - a(2,1)*v[3] + a(2,4)*v[0],
 
 3262            a(2,0)*v[7] - a(2,2)*v[2] + a(2,3)*v[1],
 
 3263            a(2,0)*v[8] - a(2,2)*v[3] + a(2,4)*v[1],
 
 3264            a(2,0)*v[9] - a(2,3)*v[3] + a(2,4)*v[2],
 
 3265            a(2,1)*v[7] - a(2,2)*v[5] + a(2,3)*v[4],
 
 3266            a(2,1)*v[8] - a(2,2)*v[6] + a(2,4)*v[4],
 
 3267            a(2,1)*v[9] - a(2,3)*v[6] + a(2,4)*v[5],
 
 3268            a(2,2)*v[9] - a(2,3)*v[8] + a(2,4)*v[7]
 
 3272            + a(0,0)*(a(1,1)*w[9] - a(1,2)*w[8] + a(1,3)*w[7] - a(1,4)*w[6])
 
 3273            - a(0,1)*(a(1,0)*w[9] - a(1,2)*w[5] + a(1,3)*w[4] - a(1,4)*w[3])
 
 3274            + a(0,2)*(a(1,0)*w[8] - a(1,1)*w[5] + a(1,3)*w[2] - a(1,4)*w[1])
 
 3275            - a(0,3)*(a(1,0)*w[7] - a(1,1)*w[4] + a(1,2)*w[2] - a(1,4)*w[0])
 
 3276            + a(0,4)*(a(1,0)*w[6] - a(1,1)*w[3] + a(1,2)*w[1] - a(1,3)*w[0]);
 
 3281      struct DetHelper<6,6>
 
 3283        template<
typename T_, 
int sma_, 
int sna_>
 
 3284        CUDA_HOST_DEVICE 
static T_ compute(
const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
 
 3289            a(4,0)*a(5,1) - a(4,1)*a(5,0),
 
 3290            a(4,0)*a(5,2) - a(4,2)*a(5,0),
 
 3291            a(4,0)*a(5,3) - a(4,3)*a(5,0),
 
 3292            a(4,0)*a(5,4) - a(4,4)*a(5,0),
 
 3293            a(4,0)*a(5,5) - a(4,5)*a(5,0),
 
 3294            a(4,1)*a(5,2) - a(4,2)*a(5,1),
 
 3295            a(4,1)*a(5,3) - a(4,3)*a(5,1),
 
 3296            a(4,1)*a(5,4) - a(4,4)*a(5,1),
 
 3297            a(4,1)*a(5,5) - a(4,5)*a(5,1),
 
 3298            a(4,2)*a(5,3) - a(4,3)*a(5,2),
 
 3299            a(4,2)*a(5,4) - a(4,4)*a(5,2),
 
 3300            a(4,2)*a(5,5) - a(4,5)*a(5,2),
 
 3301            a(4,3)*a(5,4) - a(4,4)*a(5,3),
 
 3302            a(4,3)*a(5,5) - a(4,5)*a(5,3),
 
 3303            a(4,4)*a(5,5) - a(4,5)*a(5,4)
 
 3308            a(3,0)*v[ 5] - a(3,1)*v[ 1] + a(3,2)*v[ 0],
 
 3309            a(3,0)*v[ 6] - a(3,1)*v[ 2] + a(3,3)*v[ 0],
 
 3310            a(3,0)*v[ 7] - a(3,1)*v[ 3] + a(3,4)*v[ 0],
 
 3311            a(3,0)*v[ 8] - a(3,1)*v[ 4] + a(3,5)*v[ 0],
 
 3312            a(3,0)*v[ 9] - a(3,2)*v[ 2] + a(3,3)*v[ 1],
 
 3313            a(3,0)*v[10] - a(3,2)*v[ 3] + a(3,4)*v[ 1],
 
 3314            a(3,0)*v[11] - a(3,2)*v[ 4] + a(3,5)*v[ 1],
 
 3315            a(3,0)*v[12] - a(3,3)*v[ 3] + a(3,4)*v[ 2],
 
 3316            a(3,0)*v[13] - a(3,3)*v[ 4] + a(3,5)*v[ 2],
 
 3317            a(3,0)*v[14] - a(3,4)*v[ 4] + a(3,5)*v[ 3],
 
 3318            a(3,1)*v[ 9] - a(3,2)*v[ 6] + a(3,3)*v[ 5],
 
 3319            a(3,1)*v[10] - a(3,2)*v[ 7] + a(3,4)*v[ 5],
 
 3320            a(3,1)*v[11] - a(3,2)*v[ 8] + a(3,5)*v[ 5],
 
 3321            a(3,1)*v[12] - a(3,3)*v[ 7] + a(3,4)*v[ 6],
 
 3322            a(3,1)*v[13] - a(3,3)*v[ 8] + a(3,5)*v[ 6],
 
 3323            a(3,1)*v[14] - a(3,4)*v[ 8] + a(3,5)*v[ 7],
 
 3324            a(3,2)*v[12] - a(3,3)*v[10] + a(3,4)*v[ 9],
 
 3325            a(3,2)*v[13] - a(3,3)*v[11] + a(3,5)*v[ 9],
 
 3326            a(3,2)*v[14] - a(3,4)*v[11] + a(3,5)*v[10],
 
 3327            a(3,3)*v[14] - a(3,4)*v[13] + a(3,5)*v[12]
 
 3330          v[ 0] = a(2,0)*w[10] - a(2,1)*w[ 4] + a(2,2)*w[ 1] - a(2,3)*w[ 0];
 
 3331          v[ 1] = a(2,0)*w[11] - a(2,1)*w[ 5] + a(2,2)*w[ 2] - a(2,4)*w[ 0];
 
 3332          v[ 2] = a(2,0)*w[12] - a(2,1)*w[ 6] + a(2,2)*w[ 3] - a(2,5)*w[ 0];
 
 3333          v[ 3] = a(2,0)*w[13] - a(2,1)*w[ 7] + a(2,3)*w[ 2] - a(2,4)*w[ 1];
 
 3334          v[ 4] = a(2,0)*w[14] - a(2,1)*w[ 8] + a(2,3)*w[ 3] - a(2,5)*w[ 1];
 
 3335          v[ 5] = a(2,0)*w[15] - a(2,1)*w[ 9] + a(2,4)*w[ 3] - a(2,5)*w[ 2];
 
 3336          v[ 6] = a(2,0)*w[16] - a(2,2)*w[ 7] + a(2,3)*w[ 5] - a(2,4)*w[ 4];
 
 3337          v[ 7] = a(2,0)*w[17] - a(2,2)*w[ 8] + a(2,3)*w[ 6] - a(2,5)*w[ 4];
 
 3338          v[ 8] = a(2,0)*w[18] - a(2,2)*w[ 9] + a(2,4)*w[ 6] - a(2,5)*w[ 5];
 
 3339          v[ 9] = a(2,0)*w[19] - a(2,3)*w[ 9] + a(2,4)*w[ 8] - a(2,5)*w[ 7];
 
 3340          v[10] = a(2,1)*w[16] - a(2,2)*w[13] + a(2,3)*w[11] - a(2,4)*w[10];
 
 3341          v[11] = a(2,1)*w[17] - a(2,2)*w[14] + a(2,3)*w[12] - a(2,5)*w[10];
 
 3342          v[12] = a(2,1)*w[18] - a(2,2)*w[15] + a(2,4)*w[12] - a(2,5)*w[11];
 
 3343          v[13] = a(2,1)*w[19] - a(2,3)*w[15] + a(2,4)*w[14] - a(2,5)*w[13];
 
 3344          v[14] = a(2,2)*w[19] - a(2,3)*w[18] + a(2,4)*w[17] - a(2,5)*w[16];
 
 3347            + a(0,0)*(a(1,1)*v[14] - a(1,2)*v[13] + a(1,3)*v[12] - a(1,4)*v[11] + a(1,5)*v[10])
 
 3348            - a(0,1)*(a(1,0)*v[14] - a(1,2)*v[ 9] + a(1,3)*v[ 8] - a(1,4)*v[ 7] + a(1,5)*v[ 6])
 
 3349            + a(0,2)*(a(1,0)*v[13] - a(1,1)*v[ 9] + a(1,3)*v[ 5] - a(1,4)*v[ 4] + a(1,5)*v[ 3])
 
 3350            - a(0,3)*(a(1,0)*v[12] - a(1,1)*v[ 8] + a(1,2)*v[ 5] - a(1,4)*v[ 2] + a(1,5)*v[ 1])
 
 3351            + a(0,4)*(a(1,0)*v[11] - a(1,1)*v[ 7] + a(1,2)*v[ 4] - a(1,3)*v[ 2] + a(1,5)*v[ 0])
 
 3352            - a(0,5)*(a(1,0)*v[10] - a(1,1)*v[ 6] + a(1,2)*v[ 3] - a(1,3)*v[ 1] + a(1,4)*v[ 0]);
 
 3362      template<
int m_, 
int n_>
 
 3365        template<
typename T_, 
int sma_, 
int sna_>
 
 3366        CUDA_HOST_DEVICE 
static T_ compute(
const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
 
 3369          Tiny::Matrix<T_, n_, n_> b;
 
 3370          for(
int i(0); i < n_; ++i)
 
 3372            for(
int j(0); j < n_; ++j)
 
 3375              for(
int k(0); k < m_; ++k)
 
 3377                b(i,j) += a(k,i)*a(k,j);
 
 3382          return Math::sqrt(DetHelper<n_,n_>::compute(b));
 
 3384          return CudaMath::cuda_sqrt(DetHelper<n_,n_>::compute(b));
 
      // square case: the "volume" of a square matrix is the absolute value of its determinant
      template<int n_>
      struct VolHelper<n_,n_>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
        {
#ifndef __CUDACC__ // note: guard name is an assumption; the excerpt lists a host and a CUDA variant
          return Math::abs(DetHelper<n_,n_>::compute(a));
#else
          return CudaMath::cuda_abs(DetHelper<n_,n_>::compute(a));
#endif
        }
      };

      // 2x1 case: length of the single column vector
      template<>
      struct VolHelper<2,1>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 1, sma_, sna_>& a)
        {
#ifndef __CUDACC__
          return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)));
#else
          return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)));
#endif
        }
      };

      // 3x1 case: length of the single column vector
      template<>
      struct VolHelper<3,1>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 1, sma_, sna_>& a)
        {
#ifndef __CUDACC__
          return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)) + Math::sqr(a(2,0)));
#else
          return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)) + CudaMath::cuda_sqr(a(2,0)));
#endif
        }
      };

      // 3x2 case: area of the parallelogram spanned by the two columns (norm of their cross product)
      template<>
      struct VolHelper<3,2>
      {
        template<typename T_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 2, sma_, sna_>& a)
        {
#ifndef __CUDACC__
          return Math::sqrt(
            Math::sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
            Math::sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
            Math::sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
#else
          return CudaMath::cuda_sqrt(
            CudaMath::cuda_sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
            CudaMath::cuda_sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
            CudaMath::cuda_sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
#endif
        }
      };
 
      template<>
      struct InverseHelper<1,1>
      {
        template<typename T_, int smb_, int snb_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
        {
          b(0,0) = T_(1) / a(0,0);
          return a(0,0);
        }
      };

      template<>
      struct InverseHelper<2,2>
      {
        template<typename T_, int smb_, int snb_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
        {
          T_ det = a(0,0)*a(1,1) - a(0,1)*a(1,0);
          // assumed completion of the elided body: scaled adjugate
          const T_ d = T_(1) / det;
          b(0,0) =  d*a(1,1);
          b(0,1) = -d*a(0,1);
          b(1,0) = -d*a(1,0);
          b(1,1) =  d*a(0,0);
          return det;
        }
      };
 
      template<>
      struct InverseHelper<3,3>
      {
        template<typename T_, int smb_, int snb_, int sma_, int sna_>
        CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
        {
          // first column of the adjugate and the determinant via Laplace expansion along the first row
          b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
          b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
          b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
          T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
          // assumed completion of the elided lines: scale by the reciprocal determinant
          const T_ d = T_(1) / det;
          b(0,0) *= d;
          b(1,0) *= d;
          b(2,0) *= d;
          b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
          b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
          b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
          b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
          b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
          b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
          return det;
        }
      };
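      // Illustrative check (hypothetical user code, not part of the header): the helper writes the
      // inverse into its first argument and returns the determinant of the second, e.g.
      //
      //   Tiny::Matrix<double, 3, 3> inv, a{{4.0, 0.0, 0.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, 1.0}};
      //   double det = Intern::InverseHelper<3, 3>::compute(inv, a); // det == 8, inv == diag(0.25, 0.5, 1.0)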
 
 3513      struct InverseHelper<4,4>
 
 3515        template<
typename T_, 
int smb_, 
int snb_, 
int sma_, 
int sna_>
 
 3516        CUDA_HOST_DEVICE 
static T_ compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, 
const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
 
 3519          w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
 
 3520          w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
 
 3521          w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
 
 3522          w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
 
 3523          w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
 
 3524          w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
 
 3525          b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
 
 3526          b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
 
 3527          b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
 
 3528          b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
 
 3529          T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0);
 
 3535          b(0,1) = d*(-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3]);
 
 3536          b(1,1) = d*( a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1]);
 
 3537          b(2,1) = d*(-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0]);
 
 3538          b(3,1) = d*( a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0]);
 
 3539          w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
 
 3540          w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
 
 3541          w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
 
 3542          w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
 
 3543          w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
 
 3544          w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
 
 3545          b(0,2) = d*( a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3]);
 
 3546          b(1,2) = d*(-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1]);
 
 3547          b(2,2) = d*( a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0]);
 
 3548          b(3,2) = d*(-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0]);
 
 3549          b(0,3) = d*(-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3]);
 
 3550          b(1,3) = d*( a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1]);
 
 3551          b(2,3) = d*(-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0]);
 
 3552          b(3,3) = d*( a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0]);
 
 3558      struct InverseHelper<5,5>
 
 3560        template<
typename T_, 
int smb_, 
int snb_, 
int sma_, 
int sna_>
 
 3561        CUDA_HOST_DEVICE 
static T_ compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, 
const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
 
 3564          w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
 
 3565          w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
 
 3566          w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
 
 3567          w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
 
 3568          w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
 
 3569          w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
 
 3570          w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
 
 3571          w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
 
 3572          w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
 
 3573          w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
 
 3574          w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
 
 3575          w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
 
 3576          w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
 
 3577          w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
 
 3578          w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
 
 3579          w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
 
 3580          w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
 
 3581          w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
 
 3582          w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
 
 3583          w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
 
 3584          b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
 
 3585          b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
 
 3586          b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
 
 3587          b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
 
 3588          b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
 
 3589          T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0)+a(0,4)*b(4,0);
 
 3596          b(0,1) = d*(-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16]);
 
 3597          b(1,1) = d*( a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13]);
 
 3598          b(2,1) = d*(-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11]);
 
 3599          b(3,1) = d*( a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10]);
 
 3600          b(4,1) = d*(-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10]);
 
 3601          w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
 
 3602          w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
 
 3603          w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
 
 3604          w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
 
 3605          w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
 
 3606          w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
 
 3607          w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
 
 3608          w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
 
 3609          w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
 
 3610          w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
 
 3611          b(0,2) = d*( a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16]);
 
 3612          b(1,2) = d*(-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13]);
 
 3613          b(2,2) = d*( a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11]);
 
 3614          b(3,2) = d*(-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10]);
 
 3615          b(4,2) = d*( a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10]);
 
 3616          w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
 
 3617          w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
 
 3618          w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
 
 3619          w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
 
 3620          w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
 
 3621          w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
 
 3622          w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
 
 3623          w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
 
 3624          w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
 
 3625          w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
 
 3626          w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
 
 3627          w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
 
 3628          w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
 
 3629          w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
 
 3630          w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
 
 3631          w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
 
 3632          w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
 
 3633          w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
 
 3634          w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
 
 3635          w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
 
 3636          b(0,3) = d*( a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16]);
 
 3637          b(1,3) = d*(-a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13]);
 
 3638          b(2,3) = d*( a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11]);
 
 3639          b(3,3) = d*(-a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10]);
 
 3640          b(4,3) = d*( a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10]);
 
 3641          b(0,4) = d*(-a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16]);
 
 3642          b(1,4) = d*( a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13]);
 
 3643          b(2,4) = d*(-a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11]);
 
 3644          b(3,4) = d*( a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10]);
 
 3645          b(4,4) = d*(-a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10]);
 
 3651      struct InverseHelper<6,6>
 
 3653        template<
typename T_, 
int smb_, 
int snb_, 
int sma_, 
int sna_>
 
 3654        CUDA_HOST_DEVICE 
static T_ compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, 
const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
 
 3657          w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
 
 3658          w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
 
 3659          w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
 
 3660          w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
 
 3661          w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
 
 3662          w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
 
 3663          w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
 
 3664          w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
 
 3665          w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
 
 3666          w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
 
 3667          w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
 
 3668          w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
 
 3669          w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
 
 3670          w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
 
 3671          w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
 
 3672          w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
 
 3673          w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
 
 3674          w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
 
 3675          w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
 
 3676          w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
 
 3677          w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
 
 3678          w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
 
 3679          w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
 
 3680          w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
 
 3681          w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
 
 3682          w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
 
 3683          w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
 
 3684          w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
 
 3685          w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
 
 3686          w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
 
 3687          w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
 
 3688          w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
 
 3689          w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
 
 3690          w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
 
 3691          w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
 
 3692          w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
 
 3693          w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
 
 3694          w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
 
 3695          w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
 
 3696          w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
 
 3697          w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
 
 3698          w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
 
 3699          w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
 
 3700          w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
 
 3701          w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
 
 3702          w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
 
 3703          w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
 
 3704          w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
 
 3705          w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
 
 3706          w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
 
 3707          b(0,0) =  a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
 
 3708          b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
 
 3709          b(2,0) =  a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
 
 3710          b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
 
 3711          b(4,0) =  a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
 
 3712          b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
 
 3713          T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0)
 
 3714            + a(0,3)*b(3,0) + a(0,4)*b(4,0) + a(0,5)*b(5,0);
 
 3722          b(0,1) = d*(-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10]);
 
 3723          b(1,1) = d*( a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6]);
 
 3724          b(2,1) = d*(-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3]);
 
 3725          b(3,1) = d*( a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1]);
 
 3726          b(4,1) = d*(-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0]);
 
 3727          b(5,1) = d*( a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0]);
 
 3728          w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
 
 3729          w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
 
 3730          w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
 
 3731          w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
 
 3732          w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
 
 3733          w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
 
 3734          w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
 
 3735          w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
 
 3736          w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
 
 3737          w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
 
 3738          w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
 
 3739          w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
 
 3740          w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
 
 3741          w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
 
 3742          w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
 
 3743          w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
 
 3744          w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
 
 3745          w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
 
 3746          w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
 
 3747          w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
 
 3748          w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
 
 3749          w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
 
 3750          w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
 
 3751          w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
 
 3752          w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
 
 3753          w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
 
 3754          w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
 
 3755          w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
 
 3756          w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
 
 3757          w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
 
 3758          w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
 
 3759          w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
 
 3760          w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
 
 3761          w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
 
 3762          w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
 
 3763          w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
 
 3764          w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
 
 3765          w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
 
 3766          w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
 
 3767          w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
 
 3768          w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
 
 3769          w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
 
 3770          w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
 
 3771          w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
 
 3772          w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
 
 3773          w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
 
 3774          w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
 
 3775          w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
 
 3776          w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
 
 3777          w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
 
 3778          b(0,2) = d*( a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10]);
 
 3779          b(1,2) = d*(-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6]);
 
 3780          b(2,2) = d*( a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3]);
 
 3781          b(3,2) = d*(-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1]);
 
 3782          b(4,2) = d*( a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0]);
 
 3783          b(5,2) = d*(-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0]);
 
 3784          b(0,3) = d*(-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10]);
 
 3785          b(1,3) = d*( a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6]);
 
 3786          b(2,3) = d*(-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3]);
 
 3787          b(3,3) = d*( a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1]);
 
 3788          b(4,3) = d*(-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0]);
 
 3789          b(5,3) = d*( a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0]);
 
 3790          w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
 
 3791          w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
 
 3792          w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
 
 3793          w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
 
 3794          w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
 
 3795          w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
 
 3796          w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
 
 3797          w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
 
 3798          w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
 
 3799          w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
 
 3800          w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
 
 3801          w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
 
 3802          w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
 
 3803          w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
 
 3804          w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
 
 3805          w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
 
 3806          w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
 
 3807          w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
 
 3808          w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
 
 3809          w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
 
 3810          w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
 
 3811          w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
 
 3812          w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
 
 3813          w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
 
 3814          w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
 
 3815          w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
 
 3816          w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
 
 3817          w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
 
 3818          w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
 
 3819          w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
 
 3820          w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
 
 3821          w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
 
 3822          w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
 
 3823          w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
 
 3824          w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
 
 3825          w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
 
 3826          w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
 
 3827          w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
 
 3828          w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
 
 3829          w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
 
 3830          w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
 
 3831          w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
 
 3832          w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
 
 3833          w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
 
 3834          w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
 
 3835          w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
 
 3836          w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
 
 3837          w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
 
 3838          w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
 
 3839          w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
 
 3840          b(0,4) = d*( a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10]);
 
 3841          b(1,4) = d*(-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6]);
 
 3842          b(2,4) = d*( a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3]);
 
 3843          b(3,4) = d*(-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1]);
 
 3844          b(4,4) = d*( a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0]);
 
 3845          b(5,4) = d*(-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0]);
 
 3846          b(0,5) = d*(-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10]);
 
 3847          b(1,5) = d*( a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6]);
 
 3848          b(2,5) = d*(-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3]);
 
 3849          b(3,5) = d*( a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1]);
 
 3850          b(4,5) = d*(-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0]);
 
 3851          b(5,5) = d*( a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0]);
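Like the 4x4 and 5x5 branches before it, the 6x6 branch above assembles the inverse column by column from cofactor expansions: the w[] workspace caches 2x2, 3x3 and 4x4 sub-determinants so that each adjugate entry reuses minors that were already computed, and the factor d = 1/det turns the adjugate into the inverse. The underlying identity (a standard relation, restated here only for orientation) is

  A^{-1} = \frac{1}{\det(A)} \, \operatorname{adj}(A), \qquad \operatorname{adj}(A)_{ij} = (-1)^{i+j} M_{ji},

where M_{ji} denotes the minor of A obtained by deleting row j and column i.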
 
 3858      struct InverseHelper<n_, n_>

 3860        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 3861        CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)

 3864          for (int i(0); i < n_; ++i)

 3866            for (int j(0); j < n_; ++j)

 3876          for(int i(0); i < n_; ++i)

 3885          for(int k(0); k < n_; ++k)
 
 3893              T_ pivot = CudaMath::cuda_abs(b(p[k], p[k]));
 
 3895              T_ pivot = Math::abs(b(p[k], p[k]));
 
 3900              for(int j(k+1); j < n_; ++j)
 
 3904                T_ abs_val = CudaMath::cuda_abs(b(p[j], p[j]));
 
 3906                T_ abs_val = Math::abs(b(p[j], p[j]));
 
 3928              det *= b(p[k], p[k]);
 
 3931              const T_ pivot = T_(1) / b(p[k], p[k]);
 
 3934              b(p[k], p[k]) = T_(1);
 
 3937              for(int j(0); j < n_; ++j)
 
 3939                b(p[k], j) *= pivot;
 
 3946            for(int i(0); i < n_; ++i)
 
 3953              const T_ factor =  b(i, p[k]);
 
 3959              for(int j(0); j < n_; ++j)
 
 3961                b(i, j) -= b(p[k], j) * factor;
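The generic InverseHelper<n_, n_> specialization above performs an in-place Gauss-Jordan elimination: the permutation array p[] picks pivots by magnitude, the pivot row is rescaled, the pivot column is eliminated from every other row, and the determinant is accumulated as the product of the pivots. Below is a simplified, host-only sketch of the same idea on a plain row-major array; it keeps an explicit identity block instead of working fully in place, all names are illustrative, and it is not a transcription of the library routine.

  #include <cmath>
  #include <cstddef>
  #include <utility>
  #include <vector>

  // Illustrative sketch only: returns det(a) and writes the inverse of the
  // n x n row-major matrix 'a' into 'inv' (no singularity check for brevity).
  template<typename T>
  T invert_gauss_jordan(const std::vector<T>& a, std::vector<T>& inv, std::size_t n)
  {
    std::vector<T> m(a);                    // working copy, reduced to the identity
    inv.assign(n * n, T(0));                // accumulates the inverse
    for(std::size_t i = 0; i < n; ++i)
      inv[i*n + i] = T(1);

    T det = T(1);
    for(std::size_t k = 0; k < n; ++k)
    {
      // partial pivoting: largest entry in column k, rows k..n-1
      std::size_t q = k;
      for(std::size_t i = k + 1; i < n; ++i)
        if(std::abs(m[i*n + k]) > std::abs(m[q*n + k]))
          q = i;
      if(q != k)
      {
        for(std::size_t j = 0; j < n; ++j)
        {
          std::swap(m[k*n + j], m[q*n + j]);
          std::swap(inv[k*n + j], inv[q*n + j]);
        }
        det = -det;                         // a row swap flips the sign of the determinant
      }

      det *= m[k*n + k];
      const T pivot = T(1) / m[k*n + k];
      for(std::size_t j = 0; j < n; ++j)    // normalize the pivot row
      {
        m[k*n + j] *= pivot;
        inv[k*n + j] *= pivot;
      }
      for(std::size_t i = 0; i < n; ++i)    // eliminate column k in all other rows
      {
        if(i == k)
          continue;
        const T f = m[i*n + k];
        for(std::size_t j = 0; j < n; ++j)
        {
          m[i*n + j] -= f * m[k*n + j];
          inv[i*n + j] -= f * inv[k*n + j];
        }
      }
    }
    return det;
  }

The library routine instead overwrites b directly and tracks the pivoting through p[]; any final reordering is handled in source lines that this excerpt does not show.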
 
 3979      struct CudaGroupedInverseHelper<1,1>

 3981        template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>

 3982        CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&, const T_& det)
 
 3984          if(tg.thread_rank() == 0)
 
 3985            b(0,0) = T_(1) / det;
 
 3990      struct CudaGroupedInverseHelper<2,2>

 3992        template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>

 3993        CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a, const T_& det)

 3998          for(int idx = tg.thread_rank(); idx < 4; idx += tg.num_threads())
 
 4000            const int i = idx /2;
 
 4001            const int j = idx %2;
 
 4002            b(i,j) = ((i+j)==1 ? T_(-1) : T_(1)) * d * a(1-j,1-i);
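Here d is assumed to hold the reciprocal determinant computed in the source lines omitted from this excerpt, and each participating thread writes exactly one entry b(i,j) of the closed-form 2x2 inverse,

  \begin{pmatrix} a_{00} & a_{01} \\ a_{10} & a_{11} \end{pmatrix}^{-1}
    = \frac{1}{a_{00}a_{11} - a_{01}a_{10}}
      \begin{pmatrix} a_{11} & -a_{01} \\ -a_{10} & a_{00} \end{pmatrix},

which is exactly what the sign factor (-1)^{i+j} and the mirrored index a(1-j,1-i) encode.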
 
 4013      struct CudaGroupedInverseHelper<3,3>

 4015        template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>

 4016        CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a, const T_& det)

 4035          for(int idx = tg.thread_rank(); idx < 9; idx += tg.num_threads())
 
 4038              b(0,0) = d*(a(1,1)*a(2,2) - a(1,2)*a(2,1));
 
 4040              b(1,0) = d*(a(1,2)*a(2,0) - a(1,0)*a(2,2));
 
 4042              b(2,0) = d*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
 
 4044              b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
 
 4046              b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
 
 4048              b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
 
 4050              b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
 
 4052              b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
 
 4054              b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
 
 4061      struct CudaGroupedInverseHelper<n_, n_>

 4063        template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>

 4064        CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a, const T_&)

 4067          for (int i = tg.thread_rank(); i < n_*n_; i += tg.num_threads())
 
 4069            const int row = i/n_;
 
 4070            const int col = i%n_;
 
 4071            b.v[row][col] = a.v[row][col];
 
 4075          __shared__ int p[n_];

 4076          for (int i = tg.thread_rank(); i < n_; i += tg.num_threads())
 
 4082          if(tg.thread_rank() < 32)
 
 4084            auto a_g = cg::coalesced_threads();
 
 4086            CudaMath::cuda_grouped_invert_matrix(a_g, n_, snb_, &b.v[0][0], p);
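In this fallback the whole thread group first copies a into b with a flat strided loop, the pivot bookkeeping lives in the __shared__ array p, and the elimination itself is delegated to CudaMath::cuda_grouped_invert_matrix on a coalesced sub-group of at most 32 threads. The strided index split is the usual flattening of an n x n matrix over the group; a minimal host-side illustration of that mapping (names are illustrative, nothing here is library API):

  #include <cstdio>

  // Prints which (row, col) entries a given rank of a cooperating group of
  // 'group_size' threads would touch in a flat strided sweep over an n x n matrix.
  void strided_coverage(int n, int group_size, int rank)
  {
    for(int i = rank; i < n * n; i += group_size)
      std::printf("rank %d -> (%d, %d)\n", rank, i / n, i % n);
  }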
 
 4099      template<int m_, int n_>

 4100      struct CofactorHelper

 4102        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4103        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, m_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, m_, n_, sma_, sna_>& a)
 
 4105          static_assert(m_ == n_, "cofactor computation is only available for square matrices!");
 
 4108          const T_ det = Intern::InverseHelper<m_,n_>::compute(b, a);
 
 4110          for(int i(0); i < m_; ++i)

 4112            for(int j(0); j <= i; ++j)

 4116            for(int j(i+1); j < n_; ++j)
 
 4118              std::swap(b(i,j), b(j,i));
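The generic cofactor path reuses the inverse machinery: InverseHelper::compute leaves the inverse of a in b and returns det(a); the loop over j <= i (whose body this excerpt omits, presumably the determinant scaling) and the swap-based transpose then turn that into the cofactor matrix. The relation being exploited is the standard identity

  \operatorname{cof}(A) = \det(A)\, A^{-\mathsf{T}}, \qquad \operatorname{adj}(A) = \operatorname{cof}(A)^{\mathsf{T}} = \det(A)\, A^{-1},

so for non-singular input the cofactor matrix costs little more than one inverse.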
 
 4126      struct CofactorHelper<1,1>

 4128        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4129        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&)
 
 4136      struct CofactorHelper<2,2>

 4138        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4139        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
 
 4149      struct CofactorHelper<3,3>

 4151        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4152        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
 
 4154          b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
 
 4155          b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
 
 4156          b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
 
 4157          b(0,1) = a(0,2)*a(2,1) - a(0,1)*a(2,2);
 
 4158          b(1,1) = a(0,0)*a(2,2) - a(0,2)*a(2,0);
 
 4159          b(2,1) = a(0,1)*a(2,0) - a(0,0)*a(2,1);
 
 4160          b(0,2) = a(0,1)*a(1,2) - a(0,2)*a(1,1);
 
 4161          b(1,2) = a(0,2)*a(1,0) - a(0,0)*a(1,2);
 
 4162          b(2,2) = a(0,0)*a(1,1) - a(0,1)*a(1,0);
 
 4167      struct CofactorHelper<4,4>

 4169        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4170        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
 
 4173          w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
 
 4174          w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
 
 4175          w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
 
 4176          w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
 
 4177          w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
 
 4178          w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
 
 4180          b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
 
 4181          b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
 
 4182          b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
 
 4183          b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
 
 4185          b(0,1) =-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3];
 
 4186          b(1,1) = a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1];
 
 4187          b(2,1) =-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0];
 
 4188          b(3,1) = a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0];
 
 4190          w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
 
 4191          w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
 
 4192          w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
 
 4193          w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
 
 4194          w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
 
 4195          w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
 
 4197          b(0,2) = a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3];
 
 4198          b(1,2) =-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1];
 
 4199          b(2,2) = a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0];
 
 4200          b(3,2) =-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0];
 
 4202          b(0,3) =-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3];
 
 4203          b(1,3) = a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1];
 
 4204          b(2,3) =-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0];
 
 4205          b(3,3) = a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0];
 
 4210      struct CofactorHelper<5,5>

 4212        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4213        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
 
 4216          w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
 
 4217          w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
 
 4218          w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
 
 4219          w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
 
 4220          w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
 
 4221          w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
 
 4222          w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
 
 4223          w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
 
 4224          w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
 
 4225          w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
 
 4226          w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
 
 4227          w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
 
 4228          w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
 
 4229          w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
 
 4230          w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
 
 4231          w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
 
 4232          w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
 
 4233          w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
 
 4234          w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
 
 4235          w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
 
 4237          b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
 
 4238          b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
 
 4239          b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
 
 4240          b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
 
 4241          b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
 
 4243          b(0,1) =-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16];
 
 4244          b(1,1) = a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13];
 
 4245          b(2,1) =-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11];
 
 4246          b(3,1) = a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10];
 
 4247          b(4,1) =-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10];
 
 4249          w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
 
 4250          w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
 
 4251          w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
 
 4252          w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
 
 4253          w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
 
 4254          w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
 
 4255          w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
 
 4256          w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
 
 4257          w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
 
 4258          w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
 
 4260          b(0,2) = a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16];
 
 4261          b(1,2) =-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13];
 
 4262          b(2,2) = a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11];
 
 4263          b(3,2) =-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10];
 
 4264          b(4,2) = a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10];
 
 4266          w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
 
 4267          w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
 
 4268          w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
 
 4269          w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
 
 4270          w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
 
 4271          w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
 
 4272          w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
 
 4273          w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
 
 4274          w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
 
 4275          w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
 
 4276          w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
 
 4277          w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
 
 4278          w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
 
 4279          w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
 
 4280          w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
 
 4281          w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
 
 4282          w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
 
 4283          w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
 
 4284          w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
 
 4285          w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
 
 4287          b(0,3) =  a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16];
 
 4288          b(1,3) = -a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13];
 
 4289          b(2,3) =  a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11];
 
 4290          b(3,3) = -a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10];
 
 4291          b(4,3) =  a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10];
 
 4293          b(0,4) = -a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16];
 
 4294          b(1,4) =  a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13];
 
 4295          b(2,4) = -a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11];
 
 4296          b(3,4) =  a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10];
 
 4297          b(4,4) = -a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10];
 
 4302      struct CofactorHelper<6,6>

 4304        template<typename T_, int smb_, int snb_, int sma_, int sna_>

 4305        CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
 
 4308          w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
 
 4309          w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
 
 4310          w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
 
 4311          w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
 
 4312          w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
 
 4313          w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
 
 4314          w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
 
 4315          w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
 
 4316          w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
 
 4317          w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
 
 4318          w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
 
 4319          w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
 
 4320          w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
 
 4321          w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
 
 4322          w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
 
 4323          w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
 
 4324          w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
 
 4325          w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
 
 4326          w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
 
 4327          w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
 
 4328          w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
 
 4329          w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
 
 4330          w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
 
 4331          w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
 
 4332          w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
 
 4333          w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
 
 4334          w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
 
 4335          w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
 
 4336          w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
 
 4337          w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
 
 4338          w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
 
 4339          w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
 
 4340          w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
 
 4341          w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
 
 4342          w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
 
 4344          w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
 
 4345          w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
 
 4346          w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
 
 4347          w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
 
 4348          w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
 
 4349          w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
 
 4350          w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
 
 4351          w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
 
 4352          w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
 
 4353          w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
 
 4354          w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
 
 4355          w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
 
 4356          w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
 
 4357          w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
 
 4358          w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
 
 4360          b(0,0) =  a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
 
 4361          b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
 
 4362          b(2,0) =  a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
 
 4363          b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
 
 4364          b(4,0) =  a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
 
 4365          b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
 
 4367          b(0,1) =-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10];
 
 4368          b(1,1) = a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6];
 
 4369          b(2,1) =-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3];
 
 4370          b(3,1) = a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1];
 
 4371          b(4,1) =-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0];
 
 4372          b(5,1) = a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0];
 
 4374          w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
 
 4375          w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
 
 4376          w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
 
 4377          w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
 
 4378          w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
 
 4379          w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
 
 4380          w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
 
 4381          w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
 
 4382          w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
 
 4383          w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
 
 4384          w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
 
 4385          w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
 
 4386          w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
 
 4387          w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
 
 4388          w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
 
 4389          w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
 
 4390          w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
 
 4391          w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
 
 4392          w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
 
 4393          w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
 
 4394          w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
 
 4395          w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
 
 4396          w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
 
 4397          w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
 
 4398          w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
 
 4399          w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
 
 4400          w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
 
 4401          w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
 
 4402          w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
 
 4403          w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
 
 4404          w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
 
 4405          w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
 
 4406          w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
 
 4407          w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
 
 4408          w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
 
 4409          w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
 
 4410          w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
 
 4411          w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
 
 4412          w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
 
 4413          w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
 
 4414          w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
 
 4415          w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
 
 4416          w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
 
 4417          w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
 
 4418          w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
 
 4419          w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
 
 4420          w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
 
 4421          w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
 
 4422          w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
 
 4423          w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
 
 4425          b(0,2) = a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10];
 
 4426          b(1,2) =-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6];
 
 4427          b(2,2) = a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3];
 
 4428          b(3,2) =-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1];
 
 4429          b(4,2) = a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0];
 
 4430          b(5,2) =-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0];
 
 4432          b(0,3) =-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10];
 
 4433          b(1,3) = a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6];
 
 4434          b(2,3) =-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3];
 
 4435          b(3,3) = a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1];
 
 4436          b(4,3) =-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0];
 
 4437          b(5,3) = a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0];
 
 4439          w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
 
 4440          w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
 
 4441          w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
 
 4442          w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
 
 4443          w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
 
 4444          w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
 
 4445          w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
 
 4446          w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
 
 4447          w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
 
 4448          w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
 
 4449          w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
 
 4450          w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
 
 4451          w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
 
 4452          w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
 
 4453          w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
 
 4454          w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
 
 4455          w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
 
 4456          w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
 
 4457          w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
 
 4458          w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
 
 4459          w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
 
 4460          w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
 
 4461          w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
 
 4462          w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
 
 4463          w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
 
 4464          w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
 
 4465          w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
 
 4466          w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
 
 4467          w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
 
 4468          w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
 
 4469          w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
 
 4470          w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
 
 4471          w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
 
 4472          w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
 
 4473          w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
 
 4475          w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
 
 4476          w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
 
 4477          w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
 
 4478          w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
 
 4479          w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
 
 4480          w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
 
 4481          w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
 
 4482          w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
 
 4483          w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
 
 4484          w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
 
 4485          w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
 
 4486          w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
 
 4487          w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
 
 4488          w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
 
 4489          w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
 
 4491          b(0,4) = a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10];
 
 4492          b(1,4) =-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6];
 
 4493          b(2,4) = a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3];
 
 4494          b(3,4) =-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1];
 
 4495          b(4,4) = a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0];
 
 4496          b(5,4) =-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0];
 
 4498          b(0,5) =-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10];
 
 4499          b(1,5) = a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6];
 
 4500          b(2,5) =-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3];
 
 4501          b(3,5) = a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1];
 
 4502          b(4,5) =-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0];
 
 4503          b(5,5) = a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0];
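These helpers back the user-facing Tiny::Matrix members listed below (set_inverse, set_cofactor, det and friends). A brief usage sketch, assuming the enclosing FEAT namespace and the default stride parameters; the include path is illustrative and may differ:

  #include <kernel/util/tiny_algebra.hpp>  // illustrative path, adjust to the actual header

  using namespace FEAT;                    // assumption: Tiny lives inside the FEAT namespace

  int main()
  {
    Tiny::Matrix<double, 3, 3> a, inv, cof;
    a.set_identity();
    a(0,1) = 2.0;                          // still invertible: det stays 1
    a(2,0) = -1.0;

    const double d = a.det();              // determinant of a
    inv.set_inverse(a);                    // inverse of a
    cof.set_cofactor(a);                   // cofactor matrix of a

    // sanity check: a * inv should be (numerically) the identity
    Tiny::Matrix<double, 3, 3> prod;
    prod.set_mat_mat_mult(a, inv);
    return (d != 0.0 && prod.norm_sub_id_frobenius() < 1e-12) ? 0 : 1;
  }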
 
#define XABORTM(msg)
Abort macro definition with custom message.
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Math Limits class template.
Tiny Matrix class template.
CUDA_HOST_DEVICE DataType det() const
Returns the determinant of the matrix.
CUDA_HOST_DEVICE Matrix(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE Matrix & set_cofactor(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the cofactor matrix of another matrix.
CUDA_HOST_DEVICE Matrix & add_scalar_main_diag(DataType alpha)
Adds a value onto the matrix's main diagonal.
CUDA_HOST_DEVICE Matrix & add_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y, const DataType alpha=DataType(1))
Adds the outer product of two vectors onto the matrix.
CUDA_HOST_DEVICE Matrix & add_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b, DataType alpha=DataType(1))
Adds the algebraic matrix-product of two other matrices onto this matrix.
CUDA_HOST_DEVICE Matrix & set_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b)
Sets this matrix to the algebraic matrix-product of two other matrices.
CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
Returns the Frobenius norm squared of the matrix.
T_ ValueType
the data type of the matrix
CUDA_HOST_DEVICE Matrix & operator=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
assignment operator
CUDA_HOST_DEVICE Matrix(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE Matrix & add_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a vector-tensor left-product onto this matrix.
CUDA_HOST_DEVICE Matrix & set_transpose(const Matrix< T_, n_, m_, sma_, sna_ > &a)
Sets this matrix to the transpose of another matrix.
CUDA_HOST_DEVICE Matrix & axpy(DataType alpha, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Adds another scaled matrix onto this matrix.
CUDA_HOST_DEVICE Matrix & operator-=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise subtraction operator
CUDA_HOST_DEVICE Matrix(const Matrix< T2_, m_, n_, sma_, sna_ > &a)
copy constructor
static CUDA_HOST_DEVICE Matrix null()
Returns a null-matrix.
CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
Returns the Frobenius norm of the difference of this matrix and the identity matrix.
CUDA_HOST_DEVICE Matrix & set_inverse(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the inverse of another matrix.
CUDA_HOST_DEVICE const RowType & operator[](int i) const
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator*=(DataType alpha)
scalar-right-multiply-by operator
CUDA_HOST_DEVICE DataType scalar_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y) const
Computes the scalar product of two vectors with this matrix.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
Vector< T_, n_, sn_ > RowType
the type of a single matrix row
CUDA_HOST_DEVICE const T_ & operator()(int i, int j) const
Access operator.
CUDA_HOST_DEVICE Matrix & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Matrix & set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
Sets this matrix to a 3D yaw-pitch-roll rotation matrix.
CUDA_HOST_DEVICE DataType trace() const
Returns the trace of the matrix.
CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
Returns the Hessian norm square.
CUDA_HOST_DEVICE Matrix(const std::initializer_list< Tx_ > &x)
Initializer list of Tiny::Vector constructor.
CUDA_HOST_DEVICE Matrix & set_gram(const Matrix< T_, l_, n_, sla_, sna_ > &a)
Sets this matrix to the Gram matrix of another matrix.
CUDA_HOST_DEVICE Matrix & set_rotation_2d(T_ angle)
Sets this matrix to a 2D rotation matrix.
CUDA_HOST_DEVICE Matrix & set_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, T_ alpha=T_(1))
Sets this matrix to the algebraic matrix double-product of three other matrices.
CUDA_HOST_DEVICE T_ & operator()(int i, int j)
Access operator.
CUDA_HOST_DEVICE Matrix & add_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the algebraic matrix double-product of three other matrices onto this matrix.
CUDA_HOST_DEVICE void convert(const Matrix< Tx_, m_, n_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Matrix & set_identity()
Sets this matrix to the identity matrix.
CUDA_HOST_DEVICE Matrix & operator+=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise addition operator
CUDA_HOST_DEVICE RowType & operator[](int i)
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix()
default constructor
CUDA_HOST_DEVICE void copy_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Copies the upper left mm_ x nn_ entries of a matrix.
CUDA_HOST_DEVICE DataType vol() const
Returns the volume of the matrix.
RowType v[sm_]
actual matrix data; that's an array of vectors
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix & set_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Sets this matrix to the result of a vector-tensor left-product.
CUDA_HOST_DEVICE Matrix & set_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y)
Sets this matrix to the outer product of two vectors.
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the matrix.
CUDA_HOST_DEVICE void copy(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Copies a matrix.
CUDA_HOST_DEVICE DataType norm_frobenius() const
Returns the Frobenius norm of the matrix.
Tiny Tensor3 class template.
CUDA_HOST_DEVICE Tensor3 & add_double_mat_mult(const Tensor3< T_, lt_, mt_, nt_, slt_, smt_, snt_ > &t, const Matrix< T_, nt_, n_, smb_, snb_ > &b, const Matrix< T_, mt_, m_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the result of a matrix-tensor-matrix double-product onto this tensor.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE void copy_n(const Tensor3< T_, la_, ma_, na_, sla_, sma_, sna_ > &a)
Copies the upper left entries of a tensor.
static constexpr int l
the tube count of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
static constexpr int sn
the column stride of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-assignment operator
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
formats the tensor
CUDA_HOST_DEVICE const T_ & operator()(int h, int i, int j) const
Access operator.
CUDA_HOST_DEVICE Tensor3 & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Tensor3 & add_vec_mat_outer_product(const Vector< T_, l_, slx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix outer product onto this tensor.
CUDA_HOST_DEVICE Tensor3 & operator+=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise addition operator
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE const PlaneType & operator[](int h) const
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-constructor
Matrix< T_, m_, n_, sm_, sn_ > PlaneType
Type of tensor data; that's an array of matrices.
static constexpr int sl
the tube stride of the tensor
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
CUDA_HOST_DEVICE PlaneType & operator[](int h)
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(DataType value)
value-assignment constructor
static CUDA_HOST_DEVICE Tensor3 null()
Returns a null-tensor.
CUDA_HOST_DEVICE Tensor3 & add_mat_tensor_mult(const Matrix< T_, l_, k_, sma_, sna_ > &a, const Tensor3< T_, k_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a matrix-tensor product onto this tensor.
PlaneType v[sl_]
Actual tensor data.
static constexpr int m
the row count of the tensor
CUDA_HOST_DEVICE void convert(const Tensor3< Tx_, l_, m_, n_, sla_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Tensor3()
default constructor
CUDA_HOST_DEVICE Tensor3 & operator-=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise subtraction operator
CUDA_HOST_DEVICE T_ & operator()(int h, int i, int j)
Access operator.
static constexpr int n
the column count of the tensor
static constexpr int sm
the row stride of the tensor
T_ ValueType
the data type of the tensor
CUDA_HOST_DEVICE Tensor3 & operator*=(DataType alpha)
scalar right-multiply-by operator
CUDA_HOST_DEVICE void copy(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
Copies a tensor3.
Tiny Vector class template.
static CUDA_HOST_DEVICE Vector null()
Returns a null-vector.
CUDA_HOST_DEVICE DataType norm_euclid_n() const
Computes the euclid norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
Computes the squared euclid norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & normalize_n()
Normalizes the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & operator=(const Vector< T_, n_, sx_ > &x)
copy-assignment operator
T_ v[s_]
actual vector data
CUDA_HOST_DEVICE bool normalized() const
Return if the vector is normalized.
CUDA_HOST_DEVICE void format_n(DataType alpha=DataType(0))
Formats the first nn_ entries of the vector.
static CUDA_HOST_DEVICE Vector convert_new(Vector &&x)
overload for moveable rvalue type
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the vector.
CUDA_HOST_DEVICE Vector(const Vector< T_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE DataType norm_max() const
Computes the max-norm of this vector.
CUDA_HOST_DEVICE T_ & operator()(int i)
Access operator.
CUDA_HOST_DEVICE const T_ & operator[](int i) const
Access operator.
CUDA_HOST_DEVICE Vector & operator-=(const Vector< T_, n_, sx_ > &x)
vector-subtract operator
CUDA_HOST_DEVICE Vector & set_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this vector to the result of a vector-matrix product.
static CUDA_HOST_DEVICE Vector convert_new(const Vector< Tx_, n_, sx_ > &x)
convert function, not callable with a non-convertible inner type
CUDA_HOST_DEVICE Vector & add_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE void scale_n(DataType alpha)
Scales the first nn_ entries of the vector.
CUDA_HOST_DEVICE Vector & set_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x)
Sets the first nn_ entries of this vector to the result of a matrix-vector product with the first mm_...
static constexpr int n
the length of the vector
CUDA_HOST_DEVICE Vector & operator*=(DataType alpha)
scalar-multiply operator
CUDA_HOST_DEVICE Vector()
default constructor
CUDA_HOST_DEVICE DataType norm_l1() const
Computes the l1-norm of this vector.
CUDA_HOST_DEVICE Vector & set_convex(DataType alpha, const Vector< T_, n_, sna_ > &a, const Vector< T_, n_, snb_ > &b)
Sets this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE Vector & axpy_n(DataType alpha, const Vector< T_, nx_, snx_ > &x)
Adds the first nn_ entries of another scaled vector onto this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST friend std::ostream & operator<<(std::ostream &lhs, const Vector &b)
Tiny::Vector streaming operator.
CUDA_HOST_DEVICE Vector & operator+=(const Vector< T_, n_, sx_ > &x)
vector-add operator
CUDA_HOST_DEVICE Vector & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE const T_ & operator()(int i) const
Access operator.
CUDA_HOST_DEVICE Vector & axpy(DataType alpha, const Vector< T_, n_, snx_ > &x)
Adds another scaled vector onto this vector.
CUDA_HOST_DEVICE void copy(const Vector< T_, n_, snx_ > &x)
Copies a vector.
CUDA_HOST_DEVICE Vector & normalize()
Normalizes this vector.
CUDA_HOST_DEVICE Vector & negate()
Negates the vector, i.e. effectively multiplies all components by -1.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE Vector(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE DataType norm_max_n() const
Computes the max-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & negate_n()
Negates the first nn_ entries of this vector.
CUDA_HOST_DEVICE DataType norm_l1_n() const
Computes the l1-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & set_convex_n(DataType alpha, const Vector< T_, na_, sna_ > &a, const Vector< T_, nb_, snb_ > &b)
Sets the first nn_ entries of this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
Computes the squared euclid norm of this vector.
CUDA_HOST_DEVICE void copy_n(const Vector< T_, nx_, snx_ > &x)
Copies the first nn_ entries of a vector.
CUDA_HOST_DEVICE void scale(DataType alpha)
Scales the vector.
CUDA_HOST_DEVICE Vector(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
static constexpr int s
the stride of the vector
CUDA_HOST_DEVICE Vector & set_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x)
Sets this vector to the result of a matrix-vector product.
CUDA_HOST_DEVICE Vector & add_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE T_ & operator[](int i)
Access operator.
T_ ValueType
the value type of the vector
CUDA_HOST_DEVICE DataType norm_euclid() const
Computes the euclid norm of this vector.
CUDA_HOST_DEVICE Vector & set_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Sets the first mm_ entries of this vector to the result of a vector-matrix product with the first mm_...
CUDA_HOST_DEVICE void convert(const Vector< Tx_, n_, sx_ > &x)
conversion operator
CUDA_HOST_DEVICE Vector & operator*=(const Vector< T_, n_, sx_ > &x)
element-wise-multiply operator
T_ sqrt(T_ x)
Returns the square-root of a value.
T_ abs(T_ x)
Returns the absolute value.
DT_ calc_opening_angle(DT_ x1, DT_ x2, DT_ y1, DT_ y2)
Calculates the opening angle of two 2D vectors.
T_ pow(T_ x, T_ y)
Returns x raised to the power of y.
T_ sin(T_ x)
Returns the sine of a value.
T_ sqr(T_ x)
Returns the square of a value.
T_ min(T_ a, T_ b)
Returns the minimum of two values.
T_ cos(T_ x)
Returns the cosine of a value.
T_ max(T_ a, T_ b)
Returns the maximum of two values.
CUDA_HOST_DEVICE Vector< T_, n_ > operator+(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector addition operator
CUDA_HOST Vector< T_, dim_ > project_onto(const Vector< T_, dim_ > &x, const Vector< T_, dim_ > &y)
Calculates the projected vector.
CUDA_HOST_DEVICE T_ dot(const T_ &a, const T_ &b)
Computes the dot-product of two scalars.
CUDA_HOST_DEVICE void orthogonal_3x2(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 3x2 matrix.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
CUDA_HOST_DEVICE void orthogonal_2x1(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 2x1 matrix.
CUDA_HOST_DEVICE void add_id(T_ &x, const T_ &alpha)
Adds a scaled identity onto a scalar.
Vector< T_, m_ > orthogonal(const Matrix< T_, m_, m_-1, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a m_ x (m_-1) Matrix.
CUDA_HOST_DEVICE Vector< T_, n_ > operator-(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector subtraction operator
CUDA_HOST_DEVICE Vector< T_, n_ > component_product(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector element-wise-product operator
CUDA_HOST_DEVICE Vector< T_, n_ > operator*(typename Vector< T_, n_ >::DataType alpha, const Vector< T_, n_, s_ > &x)
scalar-left-multiplication operator
CUDA_HOST T_ calculate_opening_angle(const Vector< T_, 2 > &x, const Vector< T_, 2 > &y)
Calculates the counter-clockwise opening angle between two 2D vectors.
String stringify(const T_ &item)
Converts an item into a String.