tiny_algebra.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
9// includes, FEAT
10#ifndef __CUDA_ARCH__
11#include <kernel/util/math.hpp>
12#endif
13
14// includes, system
15#ifndef __CUDACC__
16#include <initializer_list>
17#else
18#include <kernel/util/cuda_math.cuh>
19#endif
20
21namespace FEAT
22{
29 namespace Tiny
30 {
48 template<
49 typename T_,
50 int n_,
51 int s_ = n_>
52 class Vector DOXY({});
53
74 template<
75 typename T_,
76 int m_,
77 int n_,
78 int sm_ = m_,
79 int sn_ = n_>
80 class Matrix DOXY({});
81
108 template<
109 typename T_,
110 int l_,
111 int m_,
112 int n_,
113 int sl_ = l_,
114 int sm_ = m_,
115 int sn_ = n_>
116 class Tensor3 DOXY({});
117
119 namespace Intern
120 {
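124      // extracts the underlying scalar data type from a (possibly nested)
125      // Tiny container type; 'level' counts the container nesting depth,
126      // with level = 0 denoting a plain scalar such as double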
129 template<typename T_>
130 struct DataTypeExtractor
131 {
133 typedef T_ MyDataType;
135 static constexpr int level = 0;
136 };
137
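145      // recursive specializations follow: each one peels a container level off,
146      // e.g. DataTypeExtractor<Vector<Matrix<double, 2, 2>, 3>> yields
147      // MyDataType = double and level = 2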
152 template<typename T_, int n_, int s_>
153 struct DataTypeExtractor<Vector<T_, n_, s_>>
154 {
156 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
158 static constexpr int level = DataTypeExtractor<T_>::level+1;
159 };
160
161 // Same for Matrix
162 template<typename T_, int m_, int n_, int sm_, int sn_>
163 struct DataTypeExtractor<Matrix<T_, m_, n_, sm_, sn_>>
164 {
166 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
168 static constexpr int level = DataTypeExtractor<T_>::level+1;
169 };
170
171 // Same for Tensor3
172 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
173 struct DataTypeExtractor<Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>>
174 {
176 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
178 static constexpr int level = DataTypeExtractor<T_>::level+1;
179 };
180
181 // forward declarations of helper classes
182 template<int m_, int n_>
183 struct DetHelper;
184
185 template<int m_, int n_>
186 struct VolHelper;
187
188 template<int m_, int n_>
189 struct InverseHelper;
190
191 template<int m_, int n_>
192 struct CofactorHelper;
193
194#ifdef __CUDACC__
195 template<int m_, int n_>
196 struct CudaGroupedInverseHelper;
197#endif
198 } // namespace Intern
200
201 /* ************************************************************************************************************* */
202 /* ************************************************************************************************************* */
203 // Tiny Vector implementation
204 /* ************************************************************************************************************* */
205 /* ************************************************************************************************************* */
206
207 template<typename T_, int n_, int s_>
208 class Vector
209 {
210 static_assert(n_ > 0, "invalid vector length");
211 static_assert(s_ >= n_, "invalid vector stride");
212
213 public:
215 static constexpr int n = n_;
217 static constexpr int s = s_;
218
220 typedef T_ ValueType;
222 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
223
225 T_ v[s_];
226
228 CUDA_HOST_DEVICE Vector()
229 {
230 }
231
233 CUDA_HOST_DEVICE explicit Vector(DataType value)
234 {
235 for(int i(0); i < n_; ++i)
236 {
237 v[i] = value;
238 }
239 }
240
242 template<int sx_>
243 CUDA_HOST_DEVICE Vector(const Vector<T_, n_, sx_>& x)
244 {
245 for(int i(0); i < n_; ++i)
246 {
247 v[i] = x.v[i];
248 }
249 }
250
252 template<typename Tx_, int sx_>
253 CUDA_HOST_DEVICE static Vector convert_new(const Vector<Tx_, n_, sx_>& x)
254 {
255 Vector v;
256 for(int i(0); i < n_; ++i)
257 {
258 if constexpr(std::is_same<T_, DataType>::value)
259 v[i] = T_(x.v[i]);
260 else
261 v[i] = T_::convert(x.v[i]);
262 }
263 return v;
264 }
265
267 CUDA_HOST_DEVICE static Vector convert_new(Vector&& x)
268 {
269 return std::move(x);
270 }
271
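275      // construction from an initializer list, e.g.
276      //   Tiny::Vector<double, 3> v{1.0, 2.0, 3.0};
277      // the list size must match the vector length n_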
284 template<typename Tx_>
285 CUDA_HOST_DEVICE explicit Vector(const std::initializer_list<Tx_>& x)
286 {
287 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
288 auto it(x.begin());
289 for(int i(0); i < n_; ++i, ++it)
290 v[i] = T_(*it);
291 }
292
294 CUDA_HOST_DEVICE Vector& operator=(DataType value)
295 {
296 for(int i(0); i < n_; ++i)
297 {
298 v[i] = value;
299 }
300 return *this;
301 }
302
304 template<int sx_>
305 CUDA_HOST_DEVICE Vector& operator=(const Vector<T_, n_, sx_>& x)
306 {
307 for(int i(0); i < n_; ++i)
308 {
309 v[i] = x.v[i];
310 }
311 return *this;
312 }
313
328 template<typename Tx_>
329 CUDA_HOST_DEVICE Vector& operator=(const std::initializer_list<Tx_>& x)
330 {
331 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
332 auto it(x.begin());
333 for(int i(0); i < n_; ++i, ++it)
334 v[i] = T_(*it);
335 return *this;
336 }
337
339 template<typename Tx_, int sx_>
340 CUDA_HOST_DEVICE void convert(const Vector<Tx_, n_, sx_>& x)
341 {
342 for(int i(0); i < n_; ++i)
343 v[i] = T_(x.v[i]);
344 }
345
354 CUDA_HOST_DEVICE T_& operator()(int i)
355 {
356 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
357 return v[i];
358 }
359
361 CUDA_HOST_DEVICE const T_& operator()(int i) const
362 {
363 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
364 return v[i];
365 }
366
368 CUDA_HOST_DEVICE T_& operator[](int i)
369 {
370 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
371 return v[i];
372 }
373
375 CUDA_HOST_DEVICE const T_& operator[](int i) const
376 {
377 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
378 return v[i];
379 }
380
387 template<int snx_>
388 CUDA_HOST_DEVICE void copy(const Vector<T_, n_, snx_>& x)
389 {
390 for(int i(0); i < n_; ++i)
391 {
392 v[i] = x.v[i];
393 }
394 }
395
405 template<int nn_, int nx_, int snx_>
406 CUDA_HOST_DEVICE void copy_n(const Vector<T_, nx_, snx_>& x)
407 {
408 static_assert(nn_ <= n_, "invalid copy_n size");
409 static_assert(nn_ <= nx_, "invalid copy_n size");
410 for(int i(0); i < nn_; ++i)
411 {
412 v[i] = x.v[i];
413 }
414 }
415
417 CUDA_HOST_DEVICE Vector& operator*=(DataType alpha)
418 {
419 for(int i(0); i < n_; ++i)
420 {
421 v[i] *= alpha;
422 }
423 return *this;
424 }
425
427 template <int sx_>
428 CUDA_HOST_DEVICE Vector& operator*=(const Vector<T_, n_, sx_>& x)
429 {
430 for(int i(0); i < n_; ++i)
431 {
432 v[i] *= x.v[i];
433 }
434 return *this;
435 }
436
438 template<int sx_>
439 CUDA_HOST_DEVICE Vector& operator+=(const Vector<T_, n_, sx_>& x)
440 {
441 for(int i(0); i < n_; ++i)
442 {
443 v[i] += x.v[i];
444 }
445 return *this;
446 }
447
449 template<int sx_>
450 CUDA_HOST_DEVICE Vector& operator-=(const Vector<T_, n_, sx_>& x)
451 {
452 for(int i(0); i < n_; ++i)
453 {
454 v[i] -= x.v[i];
455 }
456 return *this;
457 }
458
465 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
466 {
467 for(int i(0); i < n_; ++i)
468 {
469 v[i] = alpha;
470 }
471 }
472
482 template<int nn_>
483 CUDA_HOST_DEVICE void format_n(DataType alpha = DataType(0))
484 {
485 static_assert(nn_ <= n_, "invalid format_n size");
486 for(int i(0); i < nn_; ++i)
487 {
488 v[i] = alpha;
489 }
490 }
491
498 CUDA_HOST_DEVICE void scale(DataType alpha)
499 {
500 for(int i(0); i < n_; ++i)
501 {
502 v[i] *= alpha;
503 }
504 }
505
515 template<int nn_>
516 CUDA_HOST_DEVICE void scale_n(DataType alpha)
517 {
518 static_assert(nn_ <= n_, "invalid scale_n size");
519 for(int i(0); i < nn_; ++i)
520 {
521 v[i] *= alpha;
522 }
523 }
524
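527      // normalizes this vector to unit Euclidean length; the vector must not be
528      // (numerically) the null vector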
530 CUDA_HOST_DEVICE Vector& normalize()
531 {
532 #ifndef __CUDACC__
533 const DataType norm2(this->norm_euclid());
534 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
535 return ((*this) *= (DataType(1)/norm2));
536 #else
537 const DataType norm2_sqr(this->norm_euclid_sqr());
538 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
539 return ((*this) *= CudaMath::cuda_rsqrt(norm2_sqr));
540 #endif
541 }
542
551 template<int nn_>
552 CUDA_HOST_DEVICE Vector& normalize_n()
553 {
554 static_assert(nn_ <= n_, "invalid normalize_n size");
555#ifndef __CUDACC__
556 const DataType norm2(this->template norm_euclid_n<nn_>());
557 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
558 this->template scale_n<nn_>(DataType(1)/norm2);
559 return *this;
560#else
561 const DataType norm2_sqr(this->template norm_euclid_sqr_n<nn_>());
562 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
563 this->template scale_n<nn_>(CudaMath::cuda_rsqrt(norm2_sqr));
564 return *this;
565#endif
566 }
567
573 CUDA_HOST_DEVICE Vector& negate()
574 {
575 for(int i(0); i < n_; ++i)
576 v[i] = -v[i];
577 return *this;
578 }
579
588 template<int nn_>
589 CUDA_HOST_DEVICE Vector& negate_n()
590 {
591 static_assert(nn_ <= n_, "invalid negate_n size");
592 for(int i(0); i < nn_; ++i)
593 v[i] = -v[i];
594 return *this;
595 }
596
608 template<int snx_>
609 CUDA_HOST_DEVICE Vector& axpy(DataType alpha, const Vector<T_, n_, snx_>& x)
610 {
611 for(int i(0); i < n_; ++i)
612 v[i] += alpha * x.v[i];
613 return *this;
614 }
615
630 template<int nn_, int nx_, int snx_>
631 CUDA_HOST_DEVICE Vector& axpy_n(DataType alpha, const Vector<T_, nx_, snx_>& x)
632 {
633        static_assert(nn_ <= n_, "invalid axpy_n size");
634        static_assert(nn_ <= nx_, "invalid axpy_n size");
635 for(int i(0); i < nn_; ++i)
636 v[i] += alpha * x.v[i];
637 return *this;
638 }
639
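648      // sets this vector to the convex combination (1 - alpha) * a + alpha * b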
657 template<int sna_, int snb_>
658 CUDA_HOST_DEVICE Vector& set_convex(DataType alpha, const Vector<T_, n_, sna_>& a, const Vector<T_, n_, snb_>& b)
659 {
660 for(int i(0); i < n_; ++i)
661 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
662 return *this;
663 }
664
685 template<int nn_, int na_, int nb_, int sna_, int snb_>
686 CUDA_HOST_DEVICE Vector& set_convex_n(DataType alpha, const Vector<T_, na_, sna_>& a, const Vector<T_, nb_, snb_>& b)
687 {
688 static_assert(nn_ <= n_, "invalid set_convex_n size");
689 static_assert(nn_ <= na_, "invalid set_convex_n size");
690 static_assert(nn_ <= nb_, "invalid set_convex_n size");
691 for(int i(0); i < nn_; ++i)
692 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
693 return *this;
694 }
695
711 template<int m_, int sma_, int sna_, int sx_>
712      CUDA_HOST_DEVICE Vector& set_mat_vec_mult(const Matrix<T_, n_, m_, sma_, sna_>& a, const Vector<T_, m_, sx_>& x)
713      {
714 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
715 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
716
717 for(int i(0); i < n_; ++i)
718 {
719 v[i] = T_(0);
720 for(int j(0); j < m_; ++j)
721 {
722 v[i] += a.v[i][j] * x.v[j];
723 }
724 }
725 return *this;
726 }
727
747 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
748      CUDA_HOST_DEVICE Vector& set_mat_vec_mult_n(const Matrix<T_, ma_, na_, sma_, sna_>& a, const Vector<T_, nx_, sx_>& x)
749      {
750 static_assert(mm_ <= n_, "invalid set_mat_vec_mult_n size");
751 static_assert(mm_ <= ma_, "invalid set_mat_vec_mult_n size");
752 static_assert(nn_ <= nx_, "invalid set_mat_vec_mult_n size");
753 static_assert(nn_ <= na_, "invalid set_mat_vec_mult_n size");
754
755 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
756 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
757
758 for(int i(0); i < mm_; ++i)
759 {
760 v[i] = T_(0);
761 for(int j(0); j < nn_; ++j)
762 {
763 v[i] += a.v[i][j] * x.v[j];
764 }
765 }
766 return *this;
767 }
768
784 template<int m_, int sma_, int sna_, int sx_>
785      CUDA_HOST_DEVICE Vector& set_vec_mat_mult(const Vector<T_, m_, sx_>& x, const Matrix<T_, m_, n_, sma_, sna_>& a)
786      {
787 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
788 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
789
790 for(int j(0); j < n_; ++j)
791 {
792 v[j] = T_(0);
793 for(int i(0); i < m_; ++i)
794 {
795 v[j] += a.v[i][j] * x.v[i];
796 }
797 }
798 return *this;
799 }
800
820 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
821      CUDA_HOST_DEVICE Vector& set_vec_mat_mult_n(const Vector<T_, mx_, smx_>& x, const Matrix<T_, ma_, na_, sma_, sna_>& a)
822      {
823        static_assert(mm_ <= mx_, "invalid set_vec_mat_mult_n size");
824        static_assert(mm_ <= ma_, "invalid set_vec_mat_mult_n size");
825        static_assert(nn_ <= n_, "invalid set_vec_mat_mult_n size");
826        static_assert(nn_ <= na_, "invalid set_vec_mat_mult_n size");
827
828 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
829 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
830
831 for(int j(0); j < nn_; ++j)
832 {
833 v[j] = T_(0);
834 for(int i(0); i < mm_; ++i)
835 {
836 v[j] += a.v[i][j] * x.v[i];
837 }
838 }
839 return *this;
840 }
841
860 template<int m_, int sma_, int sna_, int sx_>
861      CUDA_HOST_DEVICE Vector& add_mat_vec_mult(const Matrix<T_, n_, m_, sma_, sna_>& a, const Vector<T_, m_, sx_>& x, DataType alpha = DataType(1))
862      {
863 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
864 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
865
866 for(int i(0); i < n_; ++i)
867 {
868 for(int j(0); j < m_; ++j)
869 {
870 v[i] += alpha * a.v[i][j] * x.v[j];
871 }
872 }
873 return *this;
874 }
875
900 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
901      CUDA_HOST_DEVICE Vector& add_mat_vec_mult_n(const Matrix<T_, ma_, na_, sma_, sna_>& a, const Vector<T_, nx_, sx_>& x, DataType alpha = DataType(1))
902      {
903 static_assert(mm_ <= n_, "invalid add_mat_vec_mult_n size");
904 static_assert(mm_ <= ma_, "invalid add_mat_vec_mult_n size");
905 static_assert(nn_ <= nx_, "invalid add_mat_vec_mult_n size");
906 static_assert(nn_ <= na_, "invalid add_mat_vec_mult_n size");
907
908 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
909 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
910
911 for(int i(0); i < mm_; ++i)
912 {
913 for(int j(0); j < nn_; ++j)
914 {
915 v[i] += alpha * a.v[i][j] * x.v[j];
916 }
917 }
918 return *this;
919 }
920
939 template<int m_, int sma_, int sna_, int sx_>
940      CUDA_HOST_DEVICE Vector& add_vec_mat_mult(const Vector<T_, m_, sx_>& x, const Matrix<T_, m_, n_, sma_, sna_>& a, DataType alpha = DataType(1))
941      {
942 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
943 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
944
945 for(int j(0); j < n_; ++j)
946 {
947 for(int i(0); i < m_; ++i)
948 {
949 v[j] += alpha * a.v[i][j] * x.v[i];
950 }
951 }
952 return *this;
953 }
954
979 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
980      CUDA_HOST_DEVICE Vector& add_vec_mat_mult_n(const Vector<T_, mx_, smx_>& x, const Matrix<T_, ma_, na_, sma_, sna_>& a, DataType alpha = DataType(1))
981      {
982 static_assert(mm_ <= mx_, "invalid add_vec_mat_mult_n size");
983 static_assert(mm_ <= ma_, "invalid add_vec_mat_mult_n size");
984 static_assert(nn_ <= n_, "invalid add_vec_mat_mult_n size");
985 static_assert(nn_ <= na_, "invalid add_vec_mat_mult_n size");
986
987 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
988 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
989
990 for(int j(0); j < nn_; ++j)
991 {
992 for(int i(0); i < mm_; ++i)
993 {
994 v[j] += alpha * a.v[i][j] * x.v[i];
995 }
996 }
997 return *this;
998 }
999
1006 CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
1007 {
1008 DataType r(DataType(0));
1009 for(int i(0); i < n_; ++i)
1010 {
1011 #ifndef __CUDACC__
1012 r += Math::sqr(v[i]);
1013 #else
1014 r += CudaMath::cuda_sqr(v[i]);
1015 #endif
1016 }
1017 return r;
1018 }
1019
1026 template<int nn_>
1027 CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
1028 {
1029 static_assert(nn_ <= n_, "invalid norm_euclid_sqr_n size");
1030 DataType r(DataType(0));
1031 for(int i(0); i < nn_; ++i)
1032 {
1033 #ifndef __CUDACC__
1034 r += Math::sqr(v[i]);
1035 #else
1036 r += CudaMath::cuda_sqr(v[i]);
1037 #endif
1038 }
1039 return r;
1040 }
1041
1048 CUDA_HOST_DEVICE DataType norm_euclid() const
1049 {
1050 #ifndef __CUDACC__
1051 return Math::sqrt(norm_euclid_sqr());
1052 #else
1053 return CudaMath::cuda_sqrt(norm_euclid_sqr());
1054 #endif
1055 }
1056
1063 template<int nn_>
1064 CUDA_HOST_DEVICE DataType norm_euclid_n() const
1065 {
1066 static_assert(nn_ <= n_, "invalid norm_euclid_n size");
1067 #ifndef __CUDACC__
1068 return Math::sqrt(this->template norm_euclid_sqr_n<nn_>());
1069 #else
1070 return CudaMath::cuda_sqrt(this->template norm_euclid_sqr_n<nn_>());
1071 #endif
1072 }
1073
1080 CUDA_HOST_DEVICE DataType norm_l1() const
1081 {
1082 DataType r(DataType(0));
1083 for(int i(0); i < n_; ++i)
1084 {
1085 #ifndef __CUDACC__
1086 r += Math::abs(v[i]);
1087 #else
1088 r += CudaMath::cuda_abs(v[i]);
1089 #endif
1090 }
1091 return r;
1092 }
1093
1100 template<int nn_>
1101 CUDA_HOST_DEVICE DataType norm_l1_n() const
1102 {
1103 static_assert(nn_ <= n_, "invalid norm_l1_n size");
1104 DataType r(DataType(0));
1105 for(int i(0); i < nn_; ++i)
1106 {
1107 #ifndef __CUDACC__
1108 r += Math::abs(v[i]);
1109 #else
1110 r += CudaMath::cuda_abs(v[i]);
1111 #endif
1112 }
1113 return r;
1114 }
1115
1122 CUDA_HOST_DEVICE DataType norm_max() const
1123 {
1124 DataType r(DataType(0));
1125 for(int i(0); i < n_; ++i)
1126 {
1127 #ifndef __CUDACC__
1128 r = Math::max(r, Math::abs(v[i]));
1129 #else
1130 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1131 #endif
1132 }
1133 return r;
1134 }
1135
1142 template<int nn_>
1143 CUDA_HOST_DEVICE DataType norm_max_n() const
1144 {
1145        static_assert(nn_ <= n_, "invalid norm_max_n size");
1146 DataType r(DataType(0));
1147 for(int i(0); i < nn_; ++i)
1148 {
1149 #ifndef __CUDACC__
1150 r = Math::max(r, Math::abs(v[i]));
1151 #else
1152 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1153 #endif
1154 }
1155 return r;
1156 }
1157
1161 CUDA_HOST_DEVICE static Vector null()
1162 {
1163 return Vector(DataType(0));
1164 }
1165
1169 CUDA_HOST_DEVICE bool normalized() const
1170 {
1171        return Math::abs(norm_euclid() - DataType(1)) < Math::eps<DataType>();
1172      }
1173
1180 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Vector & b)
1181 {
1182 lhs << "[";
1183 for (int i(0) ; i < b.n ; ++i)
1184 {
1185 lhs << " " << stringify(b(i));
1186 }
1187 lhs << "]";
1188
1189 return lhs;
1190 }
1191 }; // class Vector
1192
1193 template<typename T_, int sx_, int sa_>
1194 inline void cross(Vector<T_, 2, sx_>& x, const Vector<T_, 2, sa_>& a)
1195 {
1196 x.v[0] = a.v[1];
1197 x.v[1] = -a.v[0];
1198 }
1199
1200 template<typename T_, int sx_, int sa_, int sb_>
1201 inline void cross(Vector<T_, 3, sx_>& x, const Vector<T_, 3, sa_>& a, const Vector<T_, 3, sb_>& b)
1202 {
1203 x.v[0] = a.v[1]*b.v[2] - a.v[2]*b.v[1];
1204 x.v[1] = a.v[2]*b.v[0] - a.v[0]*b.v[2];
1205 x.v[2] = a.v[0]*b.v[1] - a.v[1]*b.v[0];
1206 }
1207
1209 template<typename T_, int n_, int s_>
1210 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(typename Vector<T_, n_>::DataType alpha, const Vector<T_, n_, s_>& x)
1211 {
1212 return Vector<T_, n_>(x) *= alpha;
1213 }
1214
1216 template<typename T_, int n_, int s_>
1217 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(const Vector<T_, n_, s_>& x, typename Vector<T_, n_>::DataType alpha)
1218 {
1219 return Vector<T_, n_>(x) *= alpha;
1220 }
1221
1223 template<typename T_, int n_, int sa_, int sb_>
1224    CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1225    {
1226 return Vector<T_, n_>(a) *= b;
1227 }
1228
1230 template<typename T_, int n_, int sa_, int sb_>
1231 CUDA_HOST_DEVICE inline Vector<T_, n_> operator+(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1232 {
1233 return Vector<T_, n_>(a) += b;
1234 }
1235
1237 template<typename T_, int n_, int sa_, int sb_>
1238 CUDA_HOST_DEVICE inline Vector<T_, n_> operator-(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1239 {
1240 return Vector<T_, n_>(a) -= b;
1241 }
1242
1249 template<typename T_>
1250 CUDA_HOST inline T_ calculate_opening_angle(const Vector<T_,2>& x, const Vector<T_, 2>& y)
1251 {
1252 #ifdef __CUDACC__
1253 XABORTM("calculate_opening_angle not implemented for CUDA");
1254 return T_(0);
1255 #else
1256 return Math::calc_opening_angle(x[0], x[1], y[0], y[1]);
1257 #endif
1258 }
1259
1268 template<typename T_, int dim_>
1269    CUDA_HOST inline Vector<T_, dim_> project_onto(const Vector<T_, dim_>& x, const Vector<T_, dim_>& y) // name assumed: projects x onto the direction of y
1270    {
1271 T_ norm2(y.template norm_euclid_n<dim_>());
1272 if(norm2 < Math::eps<T_>())
1273 norm2 = T_(1);
1274 const auto tmp_normalized = (T_(1)/norm2) * y;
1275 return dot(x, tmp_normalized) * tmp_normalized;
1276 }
1277
1278
1279 /* ************************************************************************************************************* */
1280 /* ************************************************************************************************************* */
1281 // Tiny Matrix implementation
1282 /* ************************************************************************************************************* */
1283 /* ************************************************************************************************************* */
1284
1285 template<typename T_, int m_, int n_, int sm_, int sn_>
1286 class Matrix
1287 {
1288 static_assert(m_ > 0, "invalid row count");
1289 static_assert(n_ > 0, "invalid column count");
1290 static_assert(sm_ >= m_, "invalid row stride");
1291 static_assert(sn_ >= n_, "invalid column stride");
1292
1293 public:
1295 static constexpr int m = m_;
1297 static constexpr int n = n_;
1299 static constexpr int sm = sm_;
1301 static constexpr int sn = sn_;
1302
1304 typedef T_ ValueType;
1306 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
1307
1308      /// the row type of this matrix
1309      typedef Vector<T_, n_, sn_> RowType;
1310
1311      RowType v[sm_];
1312
1314 CUDA_HOST_DEVICE Matrix()
1315 {
1316 }
1317
1319 CUDA_HOST_DEVICE explicit Matrix(DataType value)
1320 {
1321 for(int i(0); i < m_; ++i)
1322 {
1323 v[i] = value;
1324 }
1325 }
1326
1328 template<typename T2_, int sma_, int sna_>
1329 CUDA_HOST_DEVICE Matrix(const Matrix<T2_, m_, n_, sma_, sna_>& a)
1330 {
1331 for(int i(0); i < m_; ++i)
1332 {
1333 for(int j(0); j < n_; ++j)
1334 {
1335 v[i][j] = T_(a.v[i][j]);
1336 }
1337 }
1338 }
1339
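1344      // construction from (nested) initializer lists, e.g.
1345      //   Tiny::Matrix<double, 2, 2> a{{1.0, 2.0}, {3.0, 4.0}};
1346      // where each inner list initializes one matrix row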
1351 template<typename Tx_>
1352 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<Tx_>& x)
1353 {
1354 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1355 auto it(x.begin());
1356 for(int i(0); i < m_; ++i, ++it)
1357 v[i] = *it;
1358 }
1359
1373 template<typename Tx_>
1374 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<std::initializer_list<Tx_>>& x)
1375 {
1376 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1377 auto it(x.begin());
1378 for(int i(0); i < m_; ++i, ++it)
1379 v[i] = *it;
1380 }
1381
1383 CUDA_HOST_DEVICE Matrix& operator=(DataType value)
1384 {
1385 for(int i(0); i < m_; ++i)
1386 {
1387 v[i] = value;
1388 }
1389 return *this;
1390 }
1391
1393 template<int sma_, int sna_>
1394      CUDA_HOST_DEVICE Matrix& operator=(const Matrix<T_, m_, n_, sma_, sna_>& a)
1395      {
1396 for(int i(0); i < m_; ++i)
1397 {
1398 v[i] = a.v[i];
1399 }
1400 return *this;
1401 }
1402
1415 template<typename Tx_>
1416 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<Tx_>& x)
1417 {
1418 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1419 auto it(x.begin());
1420 for(int i(0); i < m_; ++i, ++it)
1421 v[i] = *it;
1422 return *this;
1423 }
1424
1436 template<typename Tx_>
1437 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<std::initializer_list<Tx_>>& x)
1438 {
1439 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1440 auto it(x.begin());
1441 for(int i(0); i < m_; ++i, ++it)
1442 v[i] = *it;
1443 return *this;
1444 }
1445
1447 template<typename Tx_, int sma_, int sna_>
1448 CUDA_HOST_DEVICE void convert(const Matrix<Tx_, m_, n_, sma_, sna_>& a)
1449 {
1450 for(int i(0); i < m_; ++i)
1451 v[i].convert(a.v[i]);
1452 }
1453
1463 CUDA_HOST_DEVICE T_& operator()(int i, int j)
1464 {
1465 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1466 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1467 return v[i][j];
1468 }
1469
1471 CUDA_HOST_DEVICE const T_& operator()(int i, int j) const
1472 {
1473 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1474 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1475 return v[i][j];
1476 }
1477
1487 CUDA_HOST_DEVICE RowType& operator[](int i)
1488 {
1489        ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1490 return v[i];
1491 }
1492
1494 CUDA_HOST_DEVICE const RowType& operator[](int i) const
1495 {
1496        ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1497 return v[i];
1498 }
1499
1501 CUDA_HOST_DEVICE Matrix& operator*=(DataType alpha)
1502 {
1503 for(int i(0); i < m_; ++i)
1504 {
1505 v[i] *= alpha;
1506 }
1507 return *this;
1508 }
1509
1511 template<int sma_, int sna_>
1512      CUDA_HOST_DEVICE Matrix& operator+=(const Matrix<T_, m_, n_, sma_, sna_>& a)
1513      {
1514 for(int i(0); i < m_; ++i)
1515 {
1516 v[i] += a.v[i];
1517 }
1518 return *this;
1519 }
1520
1522 template<int sma_, int sna_>
1523      CUDA_HOST_DEVICE Matrix& operator-=(const Matrix<T_, m_, n_, sma_, sna_>& a)
1524      {
1525 for(int i(0); i < m_; ++i)
1526 {
1527 v[i] -= a.v[i];
1528 }
1529 return *this;
1530 }
1531
1538 template<int sma_, int sna_>
1539 CUDA_HOST_DEVICE void copy(const Matrix<T_, m_, n_, sma_, sna_>& a)
1540 {
1541 for(int i(0); i < m_; ++i)
1542 for(int j(0); j < n_; ++j)
1543 v[i][j] = a.v[i][j];
1544 }
1545
1558 template<int mm_, int nn_, int ma_, int na_, int sma_, int sna_>
1559 CUDA_HOST_DEVICE void copy_n(const Matrix<T_, ma_, na_, sma_, sna_>& a)
1560 {
1561 static_assert(mm_ <= m_, "invalid copy_n size");
1562 static_assert(mm_ <= ma_, "invalid copy_n size");
1563 static_assert(nn_ <= n_, "invalid copy_n size");
1564 static_assert(nn_ <= na_, "invalid copy_n size");
1565 for(int i(0); i < mm_; ++i)
1566 for(int j(0); j < nn_; ++j)
1567 v[i][j] = a.v[i][j];
1568 }
1569
1576 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
1577 {
1578 for(int i(0); i < m_; ++i)
1579 {
1580 v[i].format(alpha);
1581 }
1582 }
1583
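1588      // computes a squared "Hessian norm" of this matrix: each entry is summed
1589      // once, the main diagonal a second time, and the total is halved; for a
1590      // symmetric Hessian this equals sum_{i<=j} sqr(a_ij), i.e. each unordered
1591      // entry pair is counted exactly once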
1595 CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
1596 {
1597 DataType r(0);
1598 for(int i(0); i < m_; ++i)
1599 {
1600 #ifndef __CUDACC__
1601 r += Math::sqr(v[i][i]);
1602 #else
1603 r += CudaMath::cuda_sqr(v[i][i]);
1604 #endif
1605 for(int j(0); j < n_; ++j)
1606 {
1607 #ifndef __CUDACC__
1608 r += Math::sqr(v[i][j]);
1609 #else
1610 r += CudaMath::cuda_sqr(v[i][j]);
1611 #endif
1612 }
1613 }
1614 return r / DataType(2);
1615 }
1616
1625 CUDA_HOST_DEVICE DataType norm_frobenius() const
1626 {
1627 #ifndef __CUDACC__
1628 return Math::sqrt(norm_frobenius_sqr());
1629 #else
1630 return CudaMath::cuda_sqrt(norm_frobenius_sqr());
1631 #endif
1632 }
1633
1642 CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
1643 {
1644 DataType r(0);
1645 for(int i(0); i < m_; ++i)
1646 {
1647 for(int j(0); j < n_; ++j)
1648 {
1649 #ifndef __CUDACC__
1650 r += Math::sqr(v[i][j]);
1651 #else
1652 r += CudaMath::cuda_sqr(v[i][j]);
1653 #endif
1654 }
1655 }
1656 return r;
1657 }
1658
1664 CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
1665 {
1666 DataType r(0);
1667 for(int i(0); i < m_; ++i)
1668 {
1669 for(int j(0); j < n_; ++j)
1670 {
1671 #ifndef __CUDACC__
1672 r += Math::sqr(v[i][j] - DataType(i == j ? 1 : 0));
1673 #else
1674 r += CudaMath::cuda_sqr(v[i][j] - DataType(i == j ? 1 : 0));
1675 #endif
1676 }
1677 }
1678 #ifndef __CUDACC__
1679 return Math::sqrt(r);
1680 #else
1681 return CudaMath::cuda_sqrt(r);
1682 #endif
1683 }
1684
1694 CUDA_HOST_DEVICE DataType trace() const
1695 {
1696 #ifndef __CUDACC__
1697 int k = Math::min(m_, n_);
1698 #else
1699 int k = CudaMath::cuda_min(m_, n_);
1700 #endif
1701 DataType r(0);
1702 for(int i(0); i < k; ++i)
1703 {
1704 r += v[i][i];
1705 }
1706 return r;
1707 }
1708
1716 CUDA_HOST_DEVICE DataType det() const
1717 {
1718 return Intern::DetHelper<m_, n_>::compute(*this);
1719 }
1720
1732 CUDA_HOST_DEVICE DataType vol() const
1733 {
1734 return Intern::VolHelper<m_, n_>::compute(*this);
1735 }
1736
1747 template<int sma_, int sna_>
1748      CUDA_HOST_DEVICE Matrix& set_inverse(const Matrix<T_, m_, n_, sma_, sna_>& a)
1749      {
1750 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1751 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1752 Intern::InverseHelper<m_, n_>::compute(*this, a);
1753 return *this;
1754 }
1755
1756 #ifdef __CUDACC__
1772 template<typename ThreadGroup_, int sma_, int sna_>
1773 CUDA_HOST_DEVICE __forceinline__ Matrix& grouped_set_inverse(const ThreadGroup_& tg, const Matrix<T_, m_, n_, sma_, sna_>& a, const T_& det)
1774 {
1775 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1776 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1777 Intern::CudaGroupedInverseHelper<m_, n_>::compute(tg, *this, a, det);
1778 return *this;
1779 }
1780 #endif
1781
1799 template<int sma_, int sna_>
1800      CUDA_HOST_DEVICE Matrix& set_cofactor(const Matrix<T_, m_, n_, sma_, sna_>& a)
1801      {
1802 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1803 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1804 Intern::CofactorHelper<m_, n_>::compute(*this, a);
1805 return *this;
1806 }
1807
1816 template<int sma_, int sna_>
1817      CUDA_HOST_DEVICE Matrix& set_transpose(const Matrix<T_, n_, m_, sma_, sna_>& a)
1818      {
1819 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1820 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1821
1822 for(int i(0); i < m_; ++i)
1823 {
1824 for(int j(0); j < n_; ++j)
1825 {
1826 v[i][j] = a.v[j][i];
1827 }
1828 }
1829 return *this;
1830 }
1831
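1837      // sets this matrix to the Gram matrix a^T * a of the l_-by-n_ input
1838      // matrix a, i.e. v(i,j) = sum_k a(k,i) * a(k,j); the result is symmetric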
1844 template<int l_, int sla_, int sna_>
1845      CUDA_HOST_DEVICE Matrix& set_gram(const Matrix<T_, l_, n_, sla_, sna_>& a)
1846      {
1847 static_assert(m_ == n_, "Gram matrices must be square");
1848
1849 format();
1850
1851 for(int k(0); k < l_; ++k)
1852 {
1853 for(int i(0); i < n_; ++i)
1854 {
1855 for(int j(0); j < n_; ++j)
1856 {
1857 v[i][j] += a.v[k][i] * a.v[k][j];
1858 }
1859 }
1860 }
1861
1862 return *this;
1863 }
1864
1880 template<int snx_, int sny_>
1881 CUDA_HOST_DEVICE DataType scalar_product(const Vector<T_, m_, snx_>& x, const Vector<T_, n_, sny_>& y) const
1882 {
1883 DataType r(DataType(0));
1884 for(int i(0); i < m_; ++i)
1885 {
1886 r += x[i] * dot(v[i], y);
1887 }
1888 return r;
1889 }
1890
1908 template<int snx_, int sny_>
1909 CUDA_HOST_DEVICE Matrix& add_outer_product(
1910 const Vector<T_, m_, snx_>& x,
1911 const Vector<T_, n_, sny_>& y,
1912 const DataType alpha = DataType(1))
1913 {
1914 for(int i(0); i < m_; ++i)
1915 {
1916 for(int j(0); j < n_; ++j)
1917 {
1918 v[i][j] += alpha * x[i] * y[j];
1919 }
1920 }
1921 return *this;
1922 }
1923
1938 template<int snx_, int sny_>
1939 CUDA_HOST_DEVICE Matrix& set_outer_product(
1940 const Vector<T_, m_, snx_>& x,
1941 const Vector<T_, n_, sny_>& y)
1942 {
1943 for(int i(0); i < m_; ++i)
1944 {
1945 for(int j(0); j < n_; ++j)
1946 {
1947 v[i][j] = x[i] * y[j];
1948 }
1949 }
1950 return *this;
1951 }
1952
1964 template<int sma_, int sna_>
1965 CUDA_HOST_DEVICE Matrix& axpy(DataType alpha, const Matrix<T_, m_, n_, sma_, sna_>& a)
1966 {
1967 for(int i(0); i < m_; ++i)
1968 {
1969 for(int j(0); j < n_; ++j)
1970 {
1971 v[i][j] += alpha * a.v[i][j];
1972 }
1973 }
1974 return *this;
1975 }
1976
1983 CUDA_HOST_DEVICE Matrix& add_scalar_main_diag(DataType alpha)
1984 {
1985 for(int i(0); (i < m_) && (i < n_); ++i)
1986 v[i][i] += alpha;
1987 return *this;
1988 }
1989
2008 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2009      CUDA_HOST_DEVICE Matrix& add_mat_mat_mult(
2010        const Matrix<T_, m_, la_, sma_, sna_>& a,
2011        const Matrix<T_, lb_, n_, smb_, snb_>& b,
2012        DataType alpha = DataType(1))
2013 {
2014 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2015 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2016 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2017 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2018
2019 for(int i(0); i < m_; ++i)
2020 {
2021 for(int j(0); j < n_; ++j)
2022 {
2023 DataType r(0);
2024 for(int k(0); k < la_; ++k)
2025 {
2026 r += a.v[i][k] * b.v[k][j];
2027 }
2028 v[i][j] += alpha * r;
2029 }
2030 }
2031 return *this;
2032 }
2033
2049 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2050      CUDA_HOST_DEVICE Matrix& set_mat_mat_mult(const Matrix<T_, m_, la_, sma_, sna_>& a, const Matrix<T_, lb_, n_, smb_, snb_>& b)
2051      {
2052 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2053 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2054 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2055 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2056
2057 format();
2058 return add_mat_mat_mult(a, b);
2059 }
2060
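2070      // adds alpha * b^T * a * d onto this matrix, where a is k_-by-l_, b is
2071      // k_-by-m_ and d is l_-by-n_; such "double products" arise e.g. in chain
2072      // rules for derivatives under coordinate transformations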
2082 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2083 CUDA_HOST_DEVICE Matrix& add_double_mat_mult(
2084        const Matrix<T_, k_, l_, sma_, sna_>& a,
2085        const Matrix<T_, k_, m_, smb_, snb_>& b,
2086        const Matrix<T_, l_, n_, smd_, snd_>& d,
2087        DataType alpha = DataType(1))
2088 {
2089 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2090 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2091 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2092 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2093
2094 for(int i(0); i < m_; ++i)
2095 {
2096 for(int j(0); j < n_; ++j)
2097 {
2098 DataType r(0);
2099 for(int p(0); p < k_; ++p)
2100 {
2101 DataType t(0);
2102 for(int q(0); q < l_; ++q)
2103 {
2104 t += a(p,q) * d(q,j);
2105 }
2106 r += b(p,i)*t;
2107 }
2108 v[i][j] += alpha * r;
2109 }
2110 }
2111 return *this;
2112 }
2113
2135 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2136 CUDA_HOST_DEVICE Matrix& set_double_mat_mult(
2137        const Matrix<T_, k_, l_, sma_, sna_>& a,
2138        const Matrix<T_, k_, m_, smb_, snb_>& b,
2139        const Matrix<T_, l_, n_, smd_, snd_>& d,
2140        T_ alpha = T_(1))
2141 {
2142 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2143 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2144 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2145 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2146
2147 format();
2148 return add_double_mat_mult(a, b, d, alpha);
2149 }
2150
2171 template<int l_, int snv_, int slt_, int smt_, int snt_>
2172 CUDA_HOST_DEVICE Matrix& add_vec_tensor_mult(
2173 const Vector<T_, l_, snv_>& x,
2174        const Tensor3<T_, l_, m_, n_, slt_, smt_, snt_>& t,
2175        DataType alpha = DataType(1))
2176 {
2177 for(int i(0); i < m_; ++i)
2178 {
2179 for(int j(0); j < n_; ++j)
2180 {
2181 DataType r(0);
2182 for(int k(0); k < l_; ++k)
2183 {
2184 r += x(k) * t(k,i,j);
2185 }
2186 v[i][j] += alpha * r;
2187 }
2188 }
2189 return *this;
2190 }
2191
2212 template<int l_, int snv_, int slt_, int smt_, int snt_>
2213 CUDA_HOST_DEVICE Matrix& set_vec_tensor_mult(
2214 const Vector<T_, l_, snv_>& x,
2215        const Tensor3<T_, l_, m_, n_, slt_, smt_, snt_>& t,
2216        DataType alpha = DataType(1))
2217 {
2218 format();
2219 return add_vec_tensor_mult(x, t, alpha);
2220 }
2221
2227 CUDA_HOST_DEVICE Matrix& set_identity()
2228 {
2229 for(int i(0); i < m_; ++i)
2230 for(int j(0); j < n_; ++j)
2231 v[i][j] = (i == j ? T_(1) : T_(0));
2232 return *this;
2233 }
2234
2243 CUDA_HOST_DEVICE Matrix& set_rotation_2d(T_ angle)
2244 {
2245 static_assert((m_ == 2) && (n_ == 2), "this function works only for 2x2 matrices");
2246 #ifndef __CUDACC__
2247 v[0][0] = (v[1][1] = Math::cos(angle));
2248 v[0][1] = -(v[1][0] = Math::sin(angle));
2249 #else
2250 v[0][0] = (v[1][1] = CudaMath::cuda_cos(angle));
2251 v[0][1] = -(v[1][0] = CudaMath::cuda_sin(angle));
2252 #endif
2253 return *this;
2254 }
2255
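2262      // builds the 3D rotation matrix R = R_z(yaw) * R_y(pitch) * R_x(roll),
2263      // i.e. intrinsic yaw-pitch-roll (Tait-Bryan) angles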
2270 CUDA_HOST_DEVICE Matrix& set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
2271 {
2272 static_assert((m_ == 3) && (n_ == 3), "this function works only for 3x3 matrices");
2273 #ifndef __CUDACC__
2274 const T_ cy = Math::cos(yaw);
2275 const T_ sy = Math::sin(yaw);
2276 const T_ cp = Math::cos(pitch);
2277 const T_ sp = Math::sin(pitch);
2278 const T_ cr = Math::cos(roll);
2279 const T_ sr = Math::sin(roll);
2280 #else
2281 const T_ cy = CudaMath::cuda_cos(yaw);
2282 const T_ sy = CudaMath::cuda_sin(yaw);
2283 const T_ cp = CudaMath::cuda_cos(pitch);
2284 const T_ sp = CudaMath::cuda_sin(pitch);
2285 const T_ cr = CudaMath::cuda_cos(roll);
2286 const T_ sr = CudaMath::cuda_sin(roll);
2287 #endif
2288 v[0][0] = cy*cp;
2289 v[0][1] = cy*sp*sr - sy*cr;
2290 v[0][2] = cy*sp*cr + sy*sr;
2291 v[1][0] = sy*cp;
2292 v[1][1] = sy*sp*sr + cy*cr;
2293 v[1][2] = sy*sp*cr - cy*sr;
2294 v[2][0] = -sp;
2295 v[2][1] = cp*sr;
2296 v[2][2] = cp*cr;
2297 return *this;
2298 }
2299
2303 CUDA_HOST_DEVICE static Matrix null()
2304 {
2305 return Matrix(DataType(0));
2306 }
2307
2314 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Matrix& A)
2315 {
2316 for (int i(0) ; i < m-1 ; ++i)
2317 {
2318 lhs << A[i] << "\n";
2319 }
2320 lhs << A[m-1];
2321
2322 return lhs;
2323 }
2324 }; // class Matrix
2325
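2329    // computes the (non-normalized) outer normal vector 'nu' of the tangential
2330    // 2x1 matrix 'tau' by rotating its single column by -90 degrees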
2335 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2336    CUDA_HOST_DEVICE void orthogonal_2x1(Vector<T_, mx_, smx_>& nu, const Matrix<T_, ma_, na_, sm_, sn_>& tau)
2337    {
2338 static_assert(mx_ >= 2, "invalid nu vector size for orthogonal_2x1");
2339 static_assert(ma_ >= 2, "invalid matrix row size for orthogonal_2x1");
2340 static_assert(na_ >= 1, "invalid matrix column size for orthogonal_2x1");
2341
2342 // 2d "cross" product. The sign has to be on the second component so the input is rotated in negative direction
2343 nu[0] = tau[1][0];
2344 nu[1] = -tau[0][0];
2345 }
2346
2356 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2357    CUDA_HOST_DEVICE void orthogonal_3x2(Vector<T_, mx_, smx_>& nu, const Matrix<T_, ma_, na_, sm_, sn_>& tau)
2358    {
2359 static_assert(mx_ >= 3, "invalid nu vector size for orthogonal_3x2");
2360 static_assert(ma_ >= 3, "invalid matrix row size for orthogonal_3x2");
2361 static_assert(na_ >= 2, "invalid matrix column size for orthogonal_3x2");
2362
2363 // 3d cross product
2364 nu[0] = tau[1][0]*tau[2][1] - tau[2][0]*tau[1][1];
2365 nu[1] = tau[2][0]*tau[0][1] - tau[0][0]*tau[2][1];
2366 nu[2] = tau[0][0]*tau[1][1] - tau[1][0]*tau[0][1];
2367 }
2368
2369#ifdef DOXYGEN
2393 template<typename T_, int m_, int sm_, int sn_>
2394    CUDA_HOST_DEVICE Vector<T_, m_> orthogonal(const Matrix<T_, m_, m_-1, sm_, sn_>& tau);
2395#endif
2396
2398 template<typename T_, int sm_, int sn_>
2399 CUDA_HOST_DEVICE Vector<T_, 2> orthogonal(const Matrix<T_, 2, 1, sm_, sn_>& tau)
2400 {
2401 Vector<T_, 2, sm_> nu(T_(0));
2402 orthogonal_2x1(nu, tau);
2403 return nu;
2404 }
2405
2406 template<typename T_, int sm_, int sn_>
2407 CUDA_HOST_DEVICE Vector<T_, 3> orthogonal(const Matrix<T_, 3, 2, sm_, sn_>& tau)
2408 {
2409 Vector<T_, 3, sm_> nu(T_(0));
2410 orthogonal_3x2(nu, tau);
2411 return nu;
2412 }
2414
2416 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2417    CUDA_HOST_DEVICE inline Vector<T_, m_> operator*(const Matrix<T_, m_, n_, sm_, sn_>& a, const Vector<T_, n_, sx_>& x)
2418    {
2419 return Vector<T_, m_>().set_mat_vec_mult(a, x);
2420 }
2421
2423 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2424    CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(const Vector<T_, m_, sx_>& x, const Matrix<T_, m_, n_, sm_, sn_>& a)
2425    {
2426 return Vector<T_, n_>().set_vec_mat_mult(x, a);
2427 }
2428
2430 template<typename T_, int m_, int n_, int sm_, int sn_>
2431    CUDA_HOST_DEVICE inline Matrix<T_, m_, n_> operator*(typename Matrix<T_, m_, n_>::DataType alpha, const Matrix<T_, m_, n_, sm_, sn_>& a)
2432    {
2433 return Matrix<T_, m_, n_>(a) *= alpha;
2434 }
2435
2437 template<typename T_, int m_, int n_, int sm_, int sn_>
2438    CUDA_HOST_DEVICE inline Matrix<T_, m_, n_> operator*(const Matrix<T_, m_, n_, sm_, sn_>& a, typename Matrix<T_, m_, n_>::DataType alpha)
2439    {
2440 return Matrix<T_, m_, n_>(a) *= alpha;
2441 }
2442
2444 template<typename T_, int m_, int n_, int l_, int sma_, int sna_, int smb_, int snb_>
2445    CUDA_HOST_DEVICE inline Matrix<T_, m_, n_> operator*(const Matrix<T_, m_, l_, sma_, sna_>& a, const Matrix<T_, l_, n_, smb_, snb_>& b)
2446    {
2447 return Matrix<T_, m_, n_>().set_mat_mat_mult(a, b);
2448 }
2449
2451 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2452    CUDA_HOST_DEVICE inline Matrix<T_, m_, n_> operator+(const Matrix<T_, m_, n_, sma_, sna_>& a, const Matrix<T_, m_, n_, smb_, snb_>& b)
2453    {
2454 return Matrix<T_, m_, n_>(a) += b;
2455 }
2456
2458 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2459    CUDA_HOST_DEVICE inline Matrix<T_, m_, n_> operator-(const Matrix<T_, m_, n_, sma_, sna_>& a, const Matrix<T_, m_, n_, smb_, snb_>& b)
2460    {
2461 return Matrix<T_, m_, n_>(a) -= b;
2462 }
2463
2464 /* ************************************************************************************************************* */
2465 /* ************************************************************************************************************* */
2466 // Tiny Tensor3 implementation
2467 /* ************************************************************************************************************* */
2468 /* ************************************************************************************************************* */
2469
2470 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2471 class Tensor3
2472 {
2473 static_assert(l_ > 0, "invalid tube count");
2474 static_assert(m_ > 0, "invalid row count");
2475 static_assert(n_ > 0, "invalid column count");
2476 static_assert(sl_ >= l_, "invalid tube stride");
2477 static_assert(sm_ >= m_, "invalid row stride");
2478 static_assert(sn_ >= n_, "invalid column stride");
2479
2480 public:
2482 static constexpr int l = l_;
2484 static constexpr int m = m_;
2486 static constexpr int n = n_;
2488 static constexpr int sl = sl_;
2490 static constexpr int sm = sm_;
2492 static constexpr int sn = sn_;
2493
2495 typedef T_ ValueType;
2497 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
2498
2500      /// the plane type: one m_-by-n_ matrix slice of this tensor
2501      typedef Matrix<T_, m_, n_, sm_, sn_> PlaneType;
2502
2503      /// the planes of the tensor
2504      PlaneType v[sl_];
2505 CUDA_HOST_DEVICE Tensor3()
2506 {
2507 }
2508
2510 CUDA_HOST_DEVICE explicit Tensor3(DataType value)
2511 {
2512 for(int i(0); i < l_; ++i)
2513 v[i] = value;
2514 }
2515
2522 template<typename Tx_>
2523 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<Tx_>& x)
2524 {
2525 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2526 auto it(x.begin());
2527 for(int i(0); i < l_; ++i, ++it)
2528 v[i] = *it;
2529 }
2530
2537 template<typename Tx_>
2538 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2539 {
2540 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2541 auto it(x.begin());
2542 for(int i(0); i < l_; ++i, ++it)
2543 v[i] = *it;
2544 }
2545
2547 template<int sla_, int sma_, int sna_>
2548      CUDA_HOST_DEVICE Tensor3(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2549      {
2550 for(int i(0); i < l_; ++i)
2551 v[i] = a.v[i];
2552 }
2553
2555 CUDA_HOST_DEVICE Tensor3& operator=(DataType value)
2556 {
2557 for(int i(0); i < l_; ++i)
2558 v[i] = value;
2559 return *this;
2560 }
2561
2563 template<int sla_, int sma_, int sna_>
2564      CUDA_HOST_DEVICE Tensor3& operator=(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2565      {
2566 for(int i(0); i < l_; ++i)
2567 v[i] = a.v[i];
2568 return *this;
2569 }
2570
2579 template<typename Tx_>
2580 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<Tx_>& x)
2581 {
2582 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2583 auto it(x.begin());
2584 for(int i(0); i < l_; ++i, ++it)
2585 v[i] = *it;
2586 return *this;
2587 }
2588
2597 template<typename Tx_>
2598 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2599 {
2600 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2601 auto it(x.begin());
2602 for(int i(0); i < l_; ++i, ++it)
2603 v[i] = *it;
2604 return *this;
2605 }
2606
2608 template<typename Tx_, int sla_, int sma_, int sna_>
2609      CUDA_HOST_DEVICE void convert(const Tensor3<Tx_, l_, m_, n_, sla_, sma_, sna_>& a)
2610      {
2611 for(int i(0); i < l_; ++i)
2612 v[i].convert(a.v[i]);
2613 }
2614
2624 CUDA_HOST_DEVICE T_& operator()(int h, int i, int j)
2625 {
2626 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2627 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2628 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2629 return v[h](i,j);
2630 }
2631
2633 CUDA_HOST_DEVICE const T_& operator()(int h, int i, int j) const
2634 {
2635 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2636 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2637 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2638 return v[h](i,j);
2639 }
2640
2650 CUDA_HOST_DEVICE PlaneType& operator[](int h)
2651 {
2652 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2653 return v[h];
2654 }
2655
2657 CUDA_HOST_DEVICE const PlaneType& operator[](int h) const
2658 {
2659 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2660 return v[h];
2661 }
2662
2664 CUDA_HOST_DEVICE Tensor3& operator*=(DataType alpha)
2665 {
2666 for(int i(0); i < l_; ++i)
2667 v[i] *= alpha;
2668 return *this;
2669 }
2670
2672 template<int sla_, int sma_, int sna_>
2673      CUDA_HOST_DEVICE Tensor3& operator+=(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2674      {
2675 for(int i(0); i < l_; ++i)
2676 v[i] += a.v[i];
2677 return *this;
2678 }
2679
2681 template<int sla_, int sma_, int sna_>
2682      CUDA_HOST_DEVICE Tensor3& operator-=(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2683      {
2684 for(int i(0); i < l_; ++i)
2685 v[i] -= a.v[i];
2686 return *this;
2687 }
2688
2695 template<int sla_, int sma_, int sna_>
2696 CUDA_HOST_DEVICE void copy(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2697 {
2698 for(int i(0); i < l_; ++i)
2699 for(int j(0); j < m_; ++j)
2700 for(int k(0); k < n_; ++k)
2701 v[i][j][k] = a.v[i][j][k];
2702 }
2703
2716 template<int ll_,int mm_, int nn_, int la_, int ma_, int na_, int sla_, int sma_, int sna_>
2717      CUDA_HOST_DEVICE void copy_n(const Tensor3<T_, la_, ma_, na_, sla_, sma_, sna_>& a)
2718      {
2719 static_assert(ll_ <= l_, "invalid copy_n size");
2720 static_assert(ll_ <= la_, "invalid copy_n size");
2721 static_assert(mm_ <= m_, "invalid copy_n size");
2722 static_assert(mm_ <= ma_, "invalid copy_n size");
2723 static_assert(nn_ <= n_, "invalid copy_n size");
2724 static_assert(nn_ <= na_, "invalid copy_n size");
2725 for(int i(0); i < ll_; ++i)
2726 for(int j(0); j < mm_; ++j)
2727 for(int k(0); k < nn_; ++k)
2728 v[i][j][k] = a.v[i][j][k];
2729 }
2730
2732 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
2733 {
2734 (*this) = alpha;
2735 }
2736
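2746      // adds alpha * (a * t) onto this tensor: the l_-by-k_ matrix a is
2747      // contracted with the first (tube) dimension of the k_-by-m_-by-n_ tensor t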
2757 template<int k_, int sma_, int sna_, int slt_, int smt_, int snt_>
2758 CUDA_HOST_DEVICE Tensor3& add_mat_tensor_mult(
2759        const Matrix<T_, l_, k_, sma_, sna_>& a,
2760        const Tensor3<T_, k_, m_, n_, slt_, smt_, snt_>& t,
2761        DataType alpha = DataType(1))
2762 {
2763 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2764 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2765
2766 for(int h(0); h < l_; ++h)
2767 {
2768 for(int i(0); i < m_; ++i)
2769 {
2770 for(int j(0); j < n_; ++j)
2771 {
2772 DataType r(0);
2773 for(int p(0); p < k_; ++p)
2774 {
2775 r += a(h,p) * t(p,i,j);
2776 }
2777 operator()(h,i,j) += alpha * r;
2778 }
2779 }
2780 }
2781 return *this;
2782 }
2783
2808 template<
2809 int lt_, int mt_, int nt_, // input tensor dimensions
2810 int slt_, int smt_, int snt_, // input tensor strides
2811 int smb_, int snb_, int smd_, int snd_> // input matrix strides
2812 CUDA_HOST_DEVICE Tensor3& add_double_mat_mult(
2813        const Tensor3<T_, lt_, mt_, nt_, slt_, smt_, snt_>& t,
2814        const Matrix<T_, mt_, m_, smb_, snb_>& b,
2815        const Matrix<T_, nt_, n_, smd_, snd_>& d,
2816        DataType alpha = DataType(1))
2817 {
2818 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2819 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2820
2821 for(int h(0); h < l_; ++h)
2822 {
2823 for(int i(0); i < m_; ++i)
2824 {
2825 for(int j(0); j < n_; ++j)
2826 {
2827 DataType r(0);
2828 for(int p(0); p < mt_; ++p)
2829 {
2830 for(int q(0); q < nt_; ++q)
2831 {
2832 r += t(h,p,q) * b(p,i) * d(q,j);
2833 }
2834 }
2835 operator()(h,i,j) += alpha * r;
2836 }
2837 }
2838 }
2839 return *this;
2840 }
2841
2862 template<int slx_, int sma_, int sna_>
2863      CUDA_HOST_DEVICE Tensor3& add_vec_mat_outer_mult( // name assumed: adds alpha * (x outer a) onto this tensor
2864        const Vector<T_, l_, slx_>& x,
2865        const Matrix<T_, m_, n_, sma_, sna_>& a,
2866        DataType alpha = DataType(1))
2867 {
2868 for(int h(0); h < l_; ++h)
2869 {
2870 for(int i(0); i < m_; ++i)
2871 {
2872 for(int j(0); j < n_; ++j)
2873 {
2874 operator()(h,i,j) += alpha * x(h) * a(i, j);
2875 }
2876 }
2877 }
2878 return *this;
2879 }
2880
2884 CUDA_HOST_DEVICE static Tensor3 null()
2885 {
2886 return Tensor3(DataType(0));
2887 }
2888 }; // class Tensor3<...>
2889
2891 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2892    CUDA_HOST_DEVICE inline Tensor3<T_, l_, m_, n_, sl_, sm_, sn_> operator*(
2893      typename Tensor3<T_, l_, m_, n_>::DataType alpha, const Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& a)
2894    {
2895 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2896 }
2897
2899 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2900    CUDA_HOST_DEVICE inline Tensor3<T_, l_, m_, n_, sl_, sm_, sn_> operator*(
2901      const Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& a, typename Tensor3<T_, l_, m_, n_>::DataType alpha)
2902    {
2903 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2904 }
2905
2906 /* ************************************************************************************************************* */
2907 /* ************************************************************************************************************* */
2908 // Various helper functions
2909 /* ************************************************************************************************************* */
2910 /* ************************************************************************************************************* */
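2915    // scalar fallback for the recursive dot() overloads below: enabled only for
2916    // level-0 (plain scalar) types, where dot(a, b) is simply the product a*b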
2922 template<typename T_, typename std::enable_if<Intern::DataTypeExtractor<T_>::level == 0, bool>::type = true>
2923 CUDA_HOST_DEVICE inline T_ dot(const T_& a, const T_& b)
2924 {
2925 return a*b;
2926 }
2927
2939 template<typename T_, int n_, int sa_, int sb_>
2940 CUDA_HOST_DEVICE inline typename Vector<T_, n_>::DataType dot(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
2941 {
2942 typename Vector<T_, n_>::DataType r(0);
2943
2944 for(int i(0); i < n_; ++i)
2945 {
2946 r += Tiny::dot(a.v[i], b.v[i]);
2947 }
2948 return r;
2949 }
2950
2962 template<typename T_, int m_, int n_, int sma_, int sna_, int smb_, int snb_>
2963    CUDA_HOST_DEVICE inline typename Matrix<T_, m_, n_>::DataType dot(const Matrix<T_, m_, n_, sma_, sna_>& a, const Matrix<T_, m_, n_, smb_, snb_>& b)
2964    {
2965 typename Matrix<T_, m_, n_>::DataType r(0);
2966 for(int i(0); i < m_; ++i)
2967 {
2968 r += Tiny::dot(a.v[i], b.v[i]);
2969 }
2970 return r;
2971 }
2972
2981 template<typename T_, int l_, int m_, int n_, int sla_ ,int sma_, int sna_, int slb_, int smb_, int snb_>
2982 CUDA_HOST_DEVICE inline typename Tensor3<T_, l_, m_, n_>::DataType dot(
2983      const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a,
2984      const Tensor3<T_, l_, m_, n_, slb_, smb_, snb_>& b)
2985    {
2986      typename Tensor3<T_, l_, m_, n_>::DataType r(0);
2987      for(int i(0); i < l_; ++i)
2988 {
2989 r += Tiny::dot(a.v[i], b.v[i]);
2990 }
2991 return r;
2992 }
2993
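2997    // add_id overloads: add alpha onto the "identity part" of the object, i.e.
2998    // onto a scalar directly, onto every vector entry, and onto the main
2999    // diagonal of matrices and tensors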
3003 template<typename T_>
3004 CUDA_HOST_DEVICE inline void add_id(T_& x, const T_& alpha)
3005 {
3006 x += alpha;
3007 }
3008
3018 template<typename T_, int n_, int sn_>
3019 CUDA_HOST_DEVICE inline void add_id(Vector<T_, n_, sn_>& x, const typename Vector<T_, n_, sn_>::DataType& alpha)
3020 {
3021 for(int i(0); i < n_; ++i)
3022 add_id(x(i), alpha);
3023 }
3024
3034 template<typename T_, int n_, int sm_, int sn_>
3035 CUDA_HOST_DEVICE inline void add_id(Matrix<T_, n_, n_, sm_, sn_>& x, const typename Matrix<T_, n_, n_, sm_, sn_>::DataType& alpha)
3036 {
3037 for(int i(0); i < n_; ++i)
3038 add_id(x(i,i), alpha);
3039 }
3040
3050 template<typename T_, int n_, int sl_, int sm_, int sn_>
3051    CUDA_HOST_DEVICE inline void add_id(Tensor3<T_, n_, n_, n_, sl_, sm_, sn_>& x, const typename Tensor3<T_, n_, n_, n_, sl_, sm_, sn_>::DataType& alpha)
3052    {
3053 for(int i(0); i < n_; ++i)
3054 add_id(x(i,i,i), alpha);
3055 }
3056
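3063    // axpy overloads: y += alpha * x, recursing through the Tiny containers
3064    // down to the scalar level; the scalar overload terminates the recursion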
3071 template<typename T_>
3072 CUDA_HOST_DEVICE inline void axpy(T_& y, const T_& x, const T_& alpha)
3073 {
3074 y += alpha*x;
3075 }
3076
3091 template<typename T_, int n_, int sn_>
3092 CUDA_HOST_DEVICE inline void axpy(
3093      Vector<T_, n_, sn_>& y,
3094      const Vector<T_, n_, sn_>& x,
3095 const typename Vector<T_, n_, sn_>::DataType& alpha)
3096 {
3097 for(int i(0); i < n_; ++i)
3098 axpy(y.v[i], x.v[i], alpha);
3099 }
3100
3115 template<typename T_, int m_, int n_, int sm_, int sn_>
3116 CUDA_HOST_DEVICE inline void axpy(
3117      Matrix<T_, m_, n_, sm_, sn_>& y,
3118      const Matrix<T_, m_, n_, sm_, sn_>& x,
3119      const typename Matrix<T_, m_, n_, sm_, sn_>::DataType& alpha)
3120 {
3121 for(int i(0); i < m_; ++i)
3122 axpy(y.v[i], x.v[i], alpha);
3123 }
3124
3139 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
3140 CUDA_HOST_DEVICE inline void axpy(
3141      Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& y,
3142      const Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>& x,
3143      const typename Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>::DataType& alpha)
3144    {
3145 for(int i(0); i < l_; ++i)
3146 axpy(y.v[i], x.v[i], alpha);
3147 }
3148
3149 /* ************************************************************************************************************* */
3150 /* ************************************************************************************************************* */
3151 // Internal helpers implementation
3152 /* ************************************************************************************************************* */
3153 /* ************************************************************************************************************* */
3154
3156 namespace Intern
3157 {
3158      // generic square-matrix determinant, computed via matrix inversion:
3159 template<int n_>
3160 struct DetHelper<n_,n_>
3161 {
3162 template<typename T_, int sma_, int sna_>
3163 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3164 {
3165 // perform matrix inversion which returns the determinant
3166        Tiny::Matrix<T_, n_, n_> b;
3167        const T_ det = Intern::InverseHelper<n_, n_>::compute(b, a);
3168
3169 // if the returned value is not normal, we can assume that the matrix is singular
3170 #ifndef __CUDACC__
3171 return Math::isnormal(det) ? det : T_(0);
3172 #else
3173 return CudaMath::cuda_isnormal(det) ? det : T_(0);
3174 #endif
3175 }
3176 };
3177
3178 template<>
3179 struct DetHelper<1,1>
3180 {
3181 template<typename T_, int sma_, int sna_>
3182 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3183 {
3184 return a(0,0);
3185 }
3186 };
3187
3188
3189 template<>
3190 struct DetHelper<2,2>
3191 {
3192 template<typename T_, int sma_, int sna_>
3193 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3194 {
3195 return a(0,0)*a(1,1) - a(0,1)*a(1,0);
3196 }
3197 };
3198
3199 template<>
3200 struct DetHelper<3,3>
3201 {
3202 template<typename T_, int sma_, int sna_>
3203 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3204 {
3205 return a(0,0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1))
3206 + a(0,1)*(a(1,2)*a(2,0) - a(1,0)*a(2,2))
3207 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
3208 }
3209 };
3210
3211 template<>
3212 struct DetHelper<4,4>
3213 {
3214 template<typename T_, int sma_, int sna_>
3215 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
3216 {
3217 // 2x2 determinants of rows 3-4
3218 T_ w[6] =
3219 {
3220 a(2,0)*a(3,1) - a(2,1)*a(3,0),
3221 a(2,0)*a(3,2) - a(2,2)*a(3,0),
3222 a(2,0)*a(3,3) - a(2,3)*a(3,0),
3223 a(2,1)*a(3,2) - a(2,2)*a(3,1),
3224 a(2,1)*a(3,3) - a(2,3)*a(3,1),
3225 a(2,2)*a(3,3) - a(2,3)*a(3,2)
3226 };
3227
3228 return
3229 + a(0,0) * (a(1,1)*w[5] - a(1,2)*w[4] + a(1,3)*w[3])
3230 - a(0,1) * (a(1,0)*w[5] - a(1,2)*w[2] + a(1,3)*w[1])
3231 + a(0,2) * (a(1,0)*w[4] - a(1,1)*w[2] + a(1,3)*w[0])
3232 - a(0,3) * (a(1,0)*w[3] - a(1,1)*w[1] + a(1,2)*w[0]);
3233 }
3234 };
3235
3236 template<>
3237 struct DetHelper<5,5>
3238 {
3239 template<typename T_, int sma_, int sna_>
3240 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
3241 {
3242 // 2x2 determinants of rows 4-5
3243 T_ v[10] =
3244 {
3245 a(3,0)*a(4,1) - a(3,1)*a(4,0),
3246 a(3,0)*a(4,2) - a(3,2)*a(4,0),
3247 a(3,0)*a(4,3) - a(3,3)*a(4,0),
3248 a(3,0)*a(4,4) - a(3,4)*a(4,0),
3249 a(3,1)*a(4,2) - a(3,2)*a(4,1),
3250 a(3,1)*a(4,3) - a(3,3)*a(4,1),
3251 a(3,1)*a(4,4) - a(3,4)*a(4,1),
3252 a(3,2)*a(4,3) - a(3,3)*a(4,2),
3253 a(3,2)*a(4,4) - a(3,4)*a(4,2),
3254 a(3,3)*a(4,4) - a(3,4)*a(4,3)
3255 };
3256 // 3x3 determinants of rows 3-4-5
3257 T_ w[10] =
3258 {
3259 a(2,0)*v[4] - a(2,1)*v[1] + a(2,2)*v[0],
3260 a(2,0)*v[5] - a(2,1)*v[2] + a(2,3)*v[0],
3261 a(2,0)*v[6] - a(2,1)*v[3] + a(2,4)*v[0],
3262 a(2,0)*v[7] - a(2,2)*v[2] + a(2,3)*v[1],
3263 a(2,0)*v[8] - a(2,2)*v[3] + a(2,4)*v[1],
3264 a(2,0)*v[9] - a(2,3)*v[3] + a(2,4)*v[2],
3265 a(2,1)*v[7] - a(2,2)*v[5] + a(2,3)*v[4],
3266 a(2,1)*v[8] - a(2,2)*v[6] + a(2,4)*v[4],
3267 a(2,1)*v[9] - a(2,3)*v[6] + a(2,4)*v[5],
3268 a(2,2)*v[9] - a(2,3)*v[8] + a(2,4)*v[7]
3269 };
3270
3271 return
3272 + a(0,0)*(a(1,1)*w[9] - a(1,2)*w[8] + a(1,3)*w[7] - a(1,4)*w[6])
3273 - a(0,1)*(a(1,0)*w[9] - a(1,2)*w[5] + a(1,3)*w[4] - a(1,4)*w[3])
3274 + a(0,2)*(a(1,0)*w[8] - a(1,1)*w[5] + a(1,3)*w[2] - a(1,4)*w[1])
3275 - a(0,3)*(a(1,0)*w[7] - a(1,1)*w[4] + a(1,2)*w[2] - a(1,4)*w[0])
3276 + a(0,4)*(a(1,0)*w[6] - a(1,1)*w[3] + a(1,2)*w[1] - a(1,3)*w[0]);
3277 }
3278 };
3279
3280 template<>
3281 struct DetHelper<6,6>
3282 {
3283 template<typename T_, int sma_, int sna_>
3284 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
3285 {
3286 // 2x2 determinants of rows 5-6
3287 T_ v[15] =
3288 {
3289 a(4,0)*a(5,1) - a(4,1)*a(5,0),
3290 a(4,0)*a(5,2) - a(4,2)*a(5,0),
3291 a(4,0)*a(5,3) - a(4,3)*a(5,0),
3292 a(4,0)*a(5,4) - a(4,4)*a(5,0),
3293 a(4,0)*a(5,5) - a(4,5)*a(5,0),
3294 a(4,1)*a(5,2) - a(4,2)*a(5,1),
3295 a(4,1)*a(5,3) - a(4,3)*a(5,1),
3296 a(4,1)*a(5,4) - a(4,4)*a(5,1),
3297 a(4,1)*a(5,5) - a(4,5)*a(5,1),
3298 a(4,2)*a(5,3) - a(4,3)*a(5,2),
3299 a(4,2)*a(5,4) - a(4,4)*a(5,2),
3300 a(4,2)*a(5,5) - a(4,5)*a(5,2),
3301 a(4,3)*a(5,4) - a(4,4)*a(5,3),
3302 a(4,3)*a(5,5) - a(4,5)*a(5,3),
3303 a(4,4)*a(5,5) - a(4,5)*a(5,4)
3304 };
3305 // 3x3 determinants of rows 4-5-6
3306 T_ w[20] =
3307 {
3308 a(3,0)*v[ 5] - a(3,1)*v[ 1] + a(3,2)*v[ 0],
3309 a(3,0)*v[ 6] - a(3,1)*v[ 2] + a(3,3)*v[ 0],
3310 a(3,0)*v[ 7] - a(3,1)*v[ 3] + a(3,4)*v[ 0],
3311 a(3,0)*v[ 8] - a(3,1)*v[ 4] + a(3,5)*v[ 0],
3312 a(3,0)*v[ 9] - a(3,2)*v[ 2] + a(3,3)*v[ 1],
3313 a(3,0)*v[10] - a(3,2)*v[ 3] + a(3,4)*v[ 1],
3314 a(3,0)*v[11] - a(3,2)*v[ 4] + a(3,5)*v[ 1],
3315 a(3,0)*v[12] - a(3,3)*v[ 3] + a(3,4)*v[ 2],
3316 a(3,0)*v[13] - a(3,3)*v[ 4] + a(3,5)*v[ 2],
3317 a(3,0)*v[14] - a(3,4)*v[ 4] + a(3,5)*v[ 3],
3318 a(3,1)*v[ 9] - a(3,2)*v[ 6] + a(3,3)*v[ 5],
3319 a(3,1)*v[10] - a(3,2)*v[ 7] + a(3,4)*v[ 5],
3320 a(3,1)*v[11] - a(3,2)*v[ 8] + a(3,5)*v[ 5],
3321 a(3,1)*v[12] - a(3,3)*v[ 7] + a(3,4)*v[ 6],
3322 a(3,1)*v[13] - a(3,3)*v[ 8] + a(3,5)*v[ 6],
3323 a(3,1)*v[14] - a(3,4)*v[ 8] + a(3,5)*v[ 7],
3324 a(3,2)*v[12] - a(3,3)*v[10] + a(3,4)*v[ 9],
3325 a(3,2)*v[13] - a(3,3)*v[11] + a(3,5)*v[ 9],
3326 a(3,2)*v[14] - a(3,4)*v[11] + a(3,5)*v[10],
3327 a(3,3)*v[14] - a(3,4)*v[13] + a(3,5)*v[12]
3328 };
3329 // 4x4 determinants of rows 3-4-5-6
3330 v[ 0] = a(2,0)*w[10] - a(2,1)*w[ 4] + a(2,2)*w[ 1] - a(2,3)*w[ 0];
3331 v[ 1] = a(2,0)*w[11] - a(2,1)*w[ 5] + a(2,2)*w[ 2] - a(2,4)*w[ 0];
3332 v[ 2] = a(2,0)*w[12] - a(2,1)*w[ 6] + a(2,2)*w[ 3] - a(2,5)*w[ 0];
3333 v[ 3] = a(2,0)*w[13] - a(2,1)*w[ 7] + a(2,3)*w[ 2] - a(2,4)*w[ 1];
3334 v[ 4] = a(2,0)*w[14] - a(2,1)*w[ 8] + a(2,3)*w[ 3] - a(2,5)*w[ 1];
3335 v[ 5] = a(2,0)*w[15] - a(2,1)*w[ 9] + a(2,4)*w[ 3] - a(2,5)*w[ 2];
3336 v[ 6] = a(2,0)*w[16] - a(2,2)*w[ 7] + a(2,3)*w[ 5] - a(2,4)*w[ 4];
3337 v[ 7] = a(2,0)*w[17] - a(2,2)*w[ 8] + a(2,3)*w[ 6] - a(2,5)*w[ 4];
3338 v[ 8] = a(2,0)*w[18] - a(2,2)*w[ 9] + a(2,4)*w[ 6] - a(2,5)*w[ 5];
3339 v[ 9] = a(2,0)*w[19] - a(2,3)*w[ 9] + a(2,4)*w[ 8] - a(2,5)*w[ 7];
3340 v[10] = a(2,1)*w[16] - a(2,2)*w[13] + a(2,3)*w[11] - a(2,4)*w[10];
3341 v[11] = a(2,1)*w[17] - a(2,2)*w[14] + a(2,3)*w[12] - a(2,5)*w[10];
3342 v[12] = a(2,1)*w[18] - a(2,2)*w[15] + a(2,4)*w[12] - a(2,5)*w[11];
3343 v[13] = a(2,1)*w[19] - a(2,3)*w[15] + a(2,4)*w[14] - a(2,5)*w[13];
3344 v[14] = a(2,2)*w[19] - a(2,3)*w[18] + a(2,4)*w[17] - a(2,5)*w[16];
3345
3346 return
3347 + a(0,0)*(a(1,1)*v[14] - a(1,2)*v[13] + a(1,3)*v[12] - a(1,4)*v[11] + a(1,5)*v[10])
3348 - a(0,1)*(a(1,0)*v[14] - a(1,2)*v[ 9] + a(1,3)*v[ 8] - a(1,4)*v[ 7] + a(1,5)*v[ 6])
3349 + a(0,2)*(a(1,0)*v[13] - a(1,1)*v[ 9] + a(1,3)*v[ 5] - a(1,4)*v[ 4] + a(1,5)*v[ 3])
3350 - a(0,3)*(a(1,0)*v[12] - a(1,1)*v[ 8] + a(1,2)*v[ 5] - a(1,4)*v[ 2] + a(1,5)*v[ 1])
3351 + a(0,4)*(a(1,0)*v[11] - a(1,1)*v[ 7] + a(1,2)*v[ 4] - a(1,3)*v[ 2] + a(1,5)*v[ 0])
3352 - a(0,5)*(a(1,0)*v[10] - a(1,1)*v[ 6] + a(1,2)*v[ 3] - a(1,3)*v[ 1] + a(1,4)*v[ 0]);
3353 }
3354 };
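// Remark (illustrative sketch of the scheme used above): the 5x5 and 6x6
// specializations implement a bottom-up Laplace expansion. All 2x2 minors of
// the two trailing rows are cached first, then combined into the 3x3 (and 4x4)
// minors of the trailing rows, and the determinant is finally expanded along
// row 0:
//
//   det(A) = sum_j (-1)^j * A(0,j) * M(0,j),
//
// where M(0,j) denotes the minor of A with row 0 and column j removed.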
3355
3361
3362 template<int m_, int n_>
3363 struct VolHelper
3364 {
3365 template<typename T_, int sma_, int sna_>
3366 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, m_, n_, sma_, sna_>& a)
3367 {
3368 // generic fallback implementation: compute b := a^T * a and return sqrt(det(b))
3369 Tiny::Matrix<T_, n_, n_> b;
3370 for(int i(0); i < n_; ++i)
3371 {
3372 for(int j(0); j < n_; ++j)
3373 {
3374 b(i,j) = T_(0);
3375 for(int k(0); k < m_; ++k)
3376 {
3377 b(i,j) += a(k,i)*a(k,j);
3378 }
3379 }
3380 }
3381 #ifndef __CUDACC__
3382 return Math::sqrt(DetHelper<n_,n_>::compute(b));
3383 #else
3384 return CudaMath::cuda_sqrt(DetHelper<n_,n_>::compute(b));
3385 #endif
3386 }
3387 };
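// Remark (illustrative): for a rectangular m_ x n_ matrix, the fallback above
// computes the n_-dimensional volume spanned by the matrix columns via the
// Gram matrix, i.e.
//
//   vol(a) = sqrt(det(a^T * a)),
//
// which the square and column-vector specializations below shortcut.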
3388
3389 template<int n_>
3390 struct VolHelper<n_,n_>
3391 {
3392 template<typename T_, int sma_, int sna_>
3393 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3394 {
3395 // square matrix special case: vol(a) = abs(det(a))
3396 #ifndef __CUDACC__
3397 return Math::abs(DetHelper<n_,n_>::compute(a));
3398 #else
3399 return CudaMath::cuda_abs(DetHelper<n_,n_>::compute(a));
3400 #endif
3401 }
3402 };
3403
3404 template<>
3405 struct VolHelper<2,1>
3406 {
3407 template<typename T_, int sma_, int sna_>
3408 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 1, sma_, sna_>& a)
3409 {
3410 // This is the Euclidean norm of the only matrix column.
3411 #ifndef __CUDACC__
3412 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)));
3413 #else
3414 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)));
3415 #endif
3416 }
3417 };
3418
3419 template<>
3420 struct VolHelper<3,1>
3421 {
3422 template<typename T_, int sma_, int sna_>
3423 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 1, sma_, sna_>& a)
3424 {
3425 // This is the Euclidean norm of the only matrix column.
3426 #ifndef __CUDACC__
3427 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)) + Math::sqr(a(2,0)));
3428 #else
3429 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)) + CudaMath::cuda_sqr(a(2,0)));
3430 #endif
3431 }
3432 };
3433
3434 template<>
3435 struct VolHelper<3,2>
3436 {
3437 template<typename T_, int sma_, int sna_>
3438 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 2, sma_, sna_>& a)
3439 {
3440 // This is the Euclidean norm of the 3D cross product of the two matrix columns.
3441 #ifndef __CUDACC__
3442 return Math::sqrt(
3443 Math::sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3444 Math::sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3445 Math::sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3446 #else
3447 return CudaMath::cuda_sqrt(
3448 CudaMath::cuda_sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3449 CudaMath::cuda_sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3450 CudaMath::cuda_sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3451 #endif
3452 }
3453 };
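// Usage sketch (illustrative, based on the Tiny::Matrix interface): the volume
// helpers are dispatched through Tiny::Matrix::vol(), e.g. for the 3x2
// Jacobian of a surface parametrization:
//
//   Tiny::Matrix<double, 3, 2> jac; // filled by the caller
//   double ds = jac.vol();          // = |col_0 x col_1|, the surface element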
3454
3460
3461 template<>
3462 struct InverseHelper<1,1>
3463 {
3464 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3465 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3466 {
3467 b(0,0) = T_(1) / a(0,0);
3468 return a(0,0);
3469 }
3470 };
3471
3472 template<>
3473 struct InverseHelper<2,2>
3474 {
3475 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3476 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3477 {
3478 T_ det = a(0,0)*a(1,1) - a(0,1)*a(1,0);
3479 T_ d = T_(1) / det;
3480 b(0,0) = d*a(1,1);
3481 b(0,1) = -d*a(0,1);
3482 b(1,0) = -d*a(1,0);
3483 b(1,1) = d*a(0,0);
3484 return det;
3485 }
3486 };
3487
3488 template<>
3489 struct InverseHelper<3,3>
3490 {
3491 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3492 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3493 {
3494 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
3495 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
3496 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
3497 T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
3498 T_ d = T_(1) / det;
3499 b(0,0) *= d;
3500 b(1,0) *= d;
3501 b(2,0) *= d;
3502 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
3503 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
3504 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
3505 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
3506 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
3507 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
3508 return det;
3509 }
3510 };
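// Usage sketch (illustrative): the inverse helpers return the determinant,
// which allows a cheap singularity check before the result is used; note that
// the closed-form 1x1..6x6 specializations perform no pivoting:
//
//   Tiny::Matrix<double, 3, 3> a, inv; // a filled by the caller
//   double det = Intern::InverseHelper<3,3>::compute(inv, a);
//   // inv is only valid if det != 0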
3511
3512 template<>
3513 struct InverseHelper<4,4>
3514 {
3515 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3516 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
3517 {
3518 T_ w[6];
3519 w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
3520 w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
3521 w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
3522 w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
3523 w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
3524 w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
3525 b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
3526 b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
3527 b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
3528 b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
3529 T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0);
3530 T_ d = T_(1) / det;
3531 b(0,0) *= d;
3532 b(1,0) *= d;
3533 b(2,0) *= d;
3534 b(3,0) *= d;
3535 b(0,1) = d*(-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3]);
3536 b(1,1) = d*( a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1]);
3537 b(2,1) = d*(-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0]);
3538 b(3,1) = d*( a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0]);
3539 w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
3540 w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
3541 w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
3542 w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
3543 w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
3544 w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
3545 b(0,2) = d*( a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3]);
3546 b(1,2) = d*(-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1]);
3547 b(2,2) = d*( a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0]);
3548 b(3,2) = d*(-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0]);
3549 b(0,3) = d*(-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3]);
3550 b(1,3) = d*( a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1]);
3551 b(2,3) = d*(-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0]);
3552 b(3,3) = d*( a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0]);
3553 return det;
3554 }
3555 };
3556
3557 template<>
3558 struct InverseHelper<5,5>
3559 {
3560 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3561 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
3562 {
3563 T_ w[20];
3564 w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
3565 w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
3566 w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
3567 w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
3568 w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
3569 w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
3570 w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
3571 w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
3572 w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
3573 w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
3574 w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
3575 w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
3576 w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
3577 w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
3578 w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
3579 w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
3580 w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
3581 w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
3582 w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
3583 w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
3584 b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
3585 b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
3586 b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
3587 b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
3588 b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
3589 T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0)+a(0,4)*b(4,0);
3590 T_ d = T_(1) / det;
3591 b(0,0) *= d;
3592 b(1,0) *= d;
3593 b(2,0) *= d;
3594 b(3,0) *= d;
3595 b(4,0) *= d;
3596 b(0,1) = d*(-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16]);
3597 b(1,1) = d*( a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13]);
3598 b(2,1) = d*(-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11]);
3599 b(3,1) = d*( a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10]);
3600 b(4,1) = d*(-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10]);
3601 w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
3602 w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
3603 w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
3604 w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
3605 w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
3606 w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
3607 w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
3608 w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
3609 w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
3610 w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
3611 b(0,2) = d*( a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16]);
3612 b(1,2) = d*(-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13]);
3613 b(2,2) = d*( a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11]);
3614 b(3,2) = d*(-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10]);
3615 b(4,2) = d*( a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10]);
3616 w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
3617 w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
3618 w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
3619 w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
3620 w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
3621 w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
3622 w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
3623 w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
3624 w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
3625 w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
3626 w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
3627 w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
3628 w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
3629 w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
3630 w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
3631 w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
3632 w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
3633 w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
3634 w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
3635 w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
3636 b(0,3) = d*( a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16]);
3637 b(1,3) = d*(-a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13]);
3638 b(2,3) = d*( a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11]);
3639 b(3,3) = d*(-a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10]);
3640 b(4,3) = d*( a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10]);
3641 b(0,4) = d*(-a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16]);
3642 b(1,4) = d*( a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13]);
3643 b(2,4) = d*(-a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11]);
3644 b(3,4) = d*( a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10]);
3645 b(4,4) = d*(-a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10]);
3646 return det;
3647 }
3648 };
3649
3650 template<>
3651 struct InverseHelper<6,6>
3652 {
3653 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3654 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
3655 {
3656 T_ w[35];
3657 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
3658 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
3659 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
3660 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
3661 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
3662 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
3663 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
3664 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
3665 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
3666 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
3667 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
3668 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
3669 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
3670 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
3671 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
3672 w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
3673 w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
3674 w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
3675 w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
3676 w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
3677 w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
3678 w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
3679 w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
3680 w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
3681 w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
3682 w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
3683 w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
3684 w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
3685 w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
3686 w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
3687 w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
3688 w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
3689 w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
3690 w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
3691 w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
3692 w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
3693 w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
3694 w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
3695 w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
3696 w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
3697 w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
3698 w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
3699 w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
3700 w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
3701 w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
3702 w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
3703 w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
3704 w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
3705 w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
3706 w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
3707 b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
3708 b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
3709 b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
3710 b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
3711 b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
3712 b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
3713 T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0)
3714 + a(0,3)*b(3,0) + a(0,4)*b(4,0) + a(0,5)*b(5,0);
3715 T_ d = T_(1) / det;
3716 b(0,0) *= d;
3717 b(1,0) *= d;
3718 b(2,0) *= d;
3719 b(3,0) *= d;
3720 b(4,0) *= d;
3721 b(5,0) *= d;
3722 b(0,1) = d*(-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10]);
3723 b(1,1) = d*( a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6]);
3724 b(2,1) = d*(-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3]);
3725 b(3,1) = d*( a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1]);
3726 b(4,1) = d*(-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0]);
3727 b(5,1) = d*( a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0]);
3728 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
3729 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
3730 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
3731 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
3732 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
3733 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
3734 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
3735 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
3736 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
3737 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
3738 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
3739 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
3740 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
3741 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
3742 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
3743 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
3744 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
3745 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
3746 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
3747 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
3748 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
3749 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
3750 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
3751 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
3752 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
3753 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
3754 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
3755 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
3756 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
3757 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
3758 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
3759 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
3760 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
3761 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
3762 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
3763 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
3764 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
3765 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
3766 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
3767 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
3768 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
3769 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
3770 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
3771 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
3772 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
3773 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
3774 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
3775 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
3776 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
3777 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
3778 b(0,2) = d*( a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10]);
3779 b(1,2) = d*(-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6]);
3780 b(2,2) = d*( a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3]);
3781 b(3,2) = d*(-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1]);
3782 b(4,2) = d*( a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0]);
3783 b(5,2) = d*(-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0]);
3784 b(0,3) = d*(-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10]);
3785 b(1,3) = d*( a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6]);
3786 b(2,3) = d*(-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3]);
3787 b(3,3) = d*( a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1]);
3788 b(4,3) = d*(-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0]);
3789 b(5,3) = d*( a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0]);
3790 w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
3791 w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
3792 w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
3793 w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
3794 w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
3795 w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
3796 w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
3797 w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
3798 w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
3799 w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
3800 w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
3801 w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
3802 w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
3803 w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
3804 w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
3805 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
3806 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
3807 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
3808 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
3809 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
3810 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
3811 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
3812 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
3813 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
3814 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
3815 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
3816 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
3817 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
3818 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
3819 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
3820 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
3821 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
3822 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
3823 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
3824 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
3825 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
3826 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
3827 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
3828 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
3829 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
3830 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
3831 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
3832 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
3833 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
3834 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
3835 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
3836 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
3837 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
3838 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
3839 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
3840 b(0,4) = d*( a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10]);
3841 b(1,4) = d*(-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6]);
3842 b(2,4) = d*( a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3]);
3843 b(3,4) = d*(-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1]);
3844 b(4,4) = d*( a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0]);
3845 b(5,4) = d*(-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0]);
3846 b(0,5) = d*(-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10]);
3847 b(1,5) = d*( a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6]);
3848 b(2,5) = d*(-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3]);
3849 b(3,5) = d*( a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1]);
3850 b(4,5) = d*(-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0]);
3851 b(5,5) = d*( a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0]);
3852 return det;
3853 }
3854 };
3855
3856 // generic square matrix inversion:
3857 template<int n_>
3858 struct InverseHelper<n_, n_>
3859 {
3860 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3861 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3862 {
3863 // copy matrix a to b
3864 for (int i(0); i < n_; ++i)
3865 {
3866 for (int j(0); j < n_; ++j)
3867 {
3868 b(i,j) = a(i,j);
3869 }
3870 }
3871
3872 // create pivot array
3873 int p[n_];
3874
3875 // initialize identity permutation
3876 for(int i(0); i < n_; ++i)
3877 {
3878 p[i] = i;
3879 }
3880
3881 // initialize determinant to 1
3882 T_ det = T_(1);
3883
3884 // primary column elimination loop
3885 for(int k(0); k < n_; ++k)
3886 {
3887 // step 1: find a pivot for the elimination of column k
3888 {
3889 // for this, we only check the rows p[j] with j >= k, as all
3890 // rows p[j] with j < k have already been eliminated and are
3891 // therefore not candidates for pivoting
3892 #ifdef __CUDACC__
3893 T_ pivot = CudaMath::cuda_abs(b(p[k], p[k]));
3894 #else
3895 T_ pivot = Math::abs(b(p[k], p[k]));
3896 #endif
3897 int i = k;
3898
3899 // loop over all unprocessed rows
3900 for(int j(k+1); j < n_; ++j)
3901 {
3902 // get our matrix value and check whether it can be a pivot
3903 #ifdef __CUDACC__
3904 T_ abs_val = CudaMath::cuda_abs(b(p[j], p[j]));
3905 #else
3906 T_ abs_val = Math::abs(b(p[j], p[j]));
3907 #endif
3908 if(abs_val > pivot)
3909 {
3910 pivot = abs_val;
3911 i = j;
3912 }
3913 }
3914
3915 // do we have to swap rows i and k?
3916 if(i > k)
3917 {
3918 // swap rows "virtually" by exchanging their permutation positions
3919 int t = p[k];
3920 p[k] = p[i];
3921 p[i] = t;
3922 }
3923 }
3924
3925 // step 2: process pivot row
3926 {
3927 // update determinant by multiplying with the pivot element
3928 det *= b(p[k], p[k]);
3929
3930 // get our inverted pivot element
3931 const T_ pivot = T_(1) / b(p[k], p[k]);
3932
3933 // replace column entry by unit column entry
3934 b(p[k], p[k]) = T_(1);
3935
3936 // divide the whole row by the inverted pivot
3937 for(int j(0); j < n_; ++j)
3938 {
3939 b(p[k], j) *= pivot;
3940 }
3941 }
3942
3943 // step 3: eliminate pivot column
3944
3945 // loop over all rows of the matrix
3946 for(int i(0); i < n_; ++i)
3947 {
3948 // skip the pivot row
3949 if(i == p[k])
3950 continue;
3951
3952 // fetch elimination factor
3953 const T_ factor = b(i, p[k]);
3954
3955 // replace by unit column entry
3956 b(i, p[k]) = T_(0);
3957
3958 // process the row
3959 for(int j(0); j < n_; ++j)
3960 {
3961 b(i, j) -= b(p[k], j) * factor;
3962 }
3963 }
3964 }
3965
3966 // return determinant
3967 return det;
3968 }
3969 };
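// Remark (illustrative): the generic fallback above is a Gauss-Jordan
// elimination with "virtual" pivoting: rows are never physically swapped;
// instead, the permutation array p[] records the pivot order and all pivot
// accesses go through p[k]. Since the same permutation index is used for both
// the pivot row and the pivot column, the accumulated pivot product equals
// det(a) including its sign, and b holds the inverse without any final
// back-permutation.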
3970
3976
3977 #ifdef __CUDACC__
3978 template<>
3979 struct CudaGroupedInverseHelper<1,1>
3980 {
3981 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
3982 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&, const T_& det)
3983 {
3984 if(tg.thread_rank() == 0)
3985 b(0,0) = T_(1) / det;
3986 }
3987 };
3988
3989 template<>
3990 struct CudaGroupedInverseHelper<2,2>
3991 {
3992 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
3993 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a, const T_& det)
3994 {
3995 // distribute the four entries of the 2x2 inverse over the thread group;
3996 // adjugate formula: b(i,j) = (-1)^(i+j) * a(1-j,1-i) / det
3997 T_ d = T_(1) / det;
3998 for(int idx = tg.thread_rank(); idx < 4; idx += tg.num_threads())
3999 {
4000 const int i = idx / 2;
4001 const int j = idx % 2;
4002 b(i,j) = ((i+j) == 1 ? T_(-1) : T_(1)) * d * a(1-j,1-i);
4003 }
4009 }
4010 };
4011
4012 template<>
4013 struct CudaGroupedInverseHelper<3,3>
4014 {
4015 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4016 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a, const T_& det)
4017 {
4019 // each thread computes one cofactor entry; the if-else cascade on the
4020 // flat index idx avoids per-entry modulo arithmetic for the row/column split
4033 T_ d = T_(1) / det;
4034
4035 for(int idx = tg.thread_rank(); idx < 9; idx += tg.num_threads())
4036 {
4037 if(idx == 0)
4038 b(0,0) = d*(a(1,1)*a(2,2) - a(1,2)*a(2,1));
4039 else if(idx == 3)
4040 b(1,0) = d*(a(1,2)*a(2,0) - a(1,0)*a(2,2));
4041 else if(idx == 6)
4042 b(2,0) = d*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
4043 else if(idx == 1)
4044 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
4045 else if(idx == 4)
4046 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
4047 else if(idx == 7)
4048 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
4049 else if(idx == 2)
4050 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
4051 else if(idx == 5)
4052 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
4053 else if(idx == 8)
4054 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
4055 }
4056 }
4057 };
4058
4059 // generic square matrix inversion:
4060 template<int n_>
4061 struct CudaGroupedInverseHelper<n_, n_>
4062 {
4063 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4064 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a, const T_&)
4065 {
4066 // copy matrix a to b
4067 for (int i = tg.thread_rank(); i < n_*n_; i += tg.num_threads())
4068 {
4069 const int row = i/n_;
4070 const int col = i%n_;
4071 b.v[row][col] = a.v[row][col];
4072 }
4073
4074 // create shared pivot array
4075 __shared__ int p[n_];
4076 for (int i = tg.thread_rank(); i < n_; i += tg.num_threads())
4077 {
4078 p[i] = 0;
4079 }
4080 tg.sync();
4081 // call grouped invert from first warp subgroup
4082 if(tg.thread_rank() < 32)
4083 {
4084 auto a_g = cg::coalesced_threads();
4085
4086 CudaMath::cuda_grouped_invert_matrix(a_g, n_, snb_, &b.v[0][0], p);
4087 }
4088 tg.sync();
4089 }
4090 };
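// Remark (illustrative): the grouped specializations above spread the
// closed-form cofactor entries over a cooperative thread group, while the
// generic fallback copies a into b, stages a pivot array in shared memory and
// delegates the elimination to the first (coalesced) warp of the group; all
// threads of the group must therefore reach the tg.sync() calls collectively.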
4091 #endif // __CUDACC__
4092
4098
4099 template<int m_, int n_>
4100 struct CofactorHelper
4101 {
4102 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4103 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, m_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, m_, n_, sma_, sna_>& a)
4104 {
4105 static_assert(m_ == n_, "cofactor computation is only available for square matrices!");
4106
4107 // compute inverse
4108 const T_ det = Intern::InverseHelper<m_,n_>::compute(b, a);
4109
4110 for(int i(0); i < m_; ++i)
4111 {
4112 for(int j(0); j <= i; ++j)
4113 {
4114 b(i,j) *= det;
4115 }
4116 for(int j(i+1); j < n_; ++j)
4117 {
4118 std::swap(b(i,j), b(j,i));
4119 b(i,j) *= det;
4120 }
4121 }
4122 }
4123 };
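// Remark (illustrative): the generic path above exploits the adjugate
// identity
//
//   cof(a) = det(a) * inv(a)^T,
//
// i.e. b is first set to inv(a) and then transposed and scaled by det(a) in
// place. The closed-form specializations below avoid the division by det(a)
// and therefore remain valid for singular matrices.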
4124
4125 template<>
4126 struct CofactorHelper<1,1>
4127 {
4128 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4129 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&)
4130 {
4131 b[0] = T_(1);
4132 }
4133 };
4134
4135 template<>
4136 struct CofactorHelper<2,2>
4137 {
4138 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4139 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
4140 {
4141 b(0,0) = a(1,1);
4142 b(0,1) = -a(0,1);
4143 b(1,0) = -a(1,0);
4144 b(1,1) = a(0,0);
4145 }
4146 };
4147
4148 template<>
4149 struct CofactorHelper<3,3>
4150 {
4151 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4152 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
4153 {
4154 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
4155 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
4156 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
4157 b(0,1) = a(0,2)*a(2,1) - a(0,1)*a(2,2);
4158 b(1,1) = a(0,0)*a(2,2) - a(0,2)*a(2,0);
4159 b(2,1) = a(0,1)*a(2,0) - a(0,0)*a(2,1);
4160 b(0,2) = a(0,1)*a(1,2) - a(0,2)*a(1,1);
4161 b(1,2) = a(0,2)*a(1,0) - a(0,0)*a(1,2);
4162 b(2,2) = a(0,0)*a(1,1) - a(0,1)*a(1,0);
4163 }
4164 };
4165
4166 template<>
4167 struct CofactorHelper<4,4>
4168 {
4169 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4170 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
4171 {
4172 T_ w[6];
4173 w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
4174 w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
4175 w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
4176 w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
4177 w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
4178 w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
4179
4180 b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
4181 b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
4182 b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
4183 b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
4184
4185 b(0,1) =-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3];
4186 b(1,1) = a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1];
4187 b(2,1) =-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0];
4188 b(3,1) = a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0];
4189
4190 w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
4191 w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
4192 w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
4193 w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
4194 w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
4195 w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
4196
4197 b(0,2) = a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3];
4198 b(1,2) =-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1];
4199 b(2,2) = a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0];
4200 b(3,2) =-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0];
4201
4202 b(0,3) =-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3];
4203 b(1,3) = a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1];
4204 b(2,3) =-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0];
4205 b(3,3) = a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0];
4206 }
4207 };
4208
4209 template<>
4210 struct CofactorHelper<5,5>
4211 {
4212 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4213 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
4214 {
4215 T_ w[20];
4216 w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
4217 w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
4218 w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
4219 w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
4220 w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
4221 w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
4222 w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
4223 w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
4224 w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
4225 w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
4226 w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
4227 w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
4228 w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
4229 w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
4230 w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
4231 w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
4232 w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
4233 w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
4234 w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
4235 w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
4236
4237 b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
4238 b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
4239 b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
4240 b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
4241 b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
4242
4243 b(0,1) =-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16];
4244 b(1,1) = a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13];
4245 b(2,1) =-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11];
4246 b(3,1) = a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10];
4247 b(4,1) =-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10];
4248
4249 w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
4250 w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
4251 w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
4252 w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
4253 w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
4254 w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
4255 w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
4256 w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
4257 w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
4258 w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
4259
4260 b(0,2) = a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16];
4261 b(1,2) =-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13];
4262 b(2,2) = a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11];
4263 b(3,2) =-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10];
4264 b(4,2) = a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10];
4265
4266 w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
4267 w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
4268 w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
4269 w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
4270 w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
4271 w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
4272 w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
4273 w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
4274 w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
4275 w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
4276 w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
4277 w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
4278 w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
4279 w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
4280 w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
4281 w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
4282 w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
4283 w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
4284 w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
4285 w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
4286
4287 b(0,3) = a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16];
4288 b(1,3) = -a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13];
4289 b(2,3) = a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11];
4290 b(3,3) = -a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10];
4291 b(4,3) = a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10];
4292
4293 b(0,4) = -a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16];
4294 b(1,4) = a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13];
4295 b(2,4) = -a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11];
4296 b(3,4) = a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10];
4297 b(4,4) = -a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10];
4298 }
4299 };
4300
4301 template<>
4302 struct CofactorHelper<6,6>
4303 {
4304 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4305 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
4306 {
4307 T_ w[35];
4308 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4309 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4310 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4311 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4312 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4313 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4314 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4315 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4316 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4317 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4318 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4319 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4320 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4321 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4322 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
4323 w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
4324 w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
4325 w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
4326 w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
4327 w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
4328 w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
4329 w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
4330 w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
4331 w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
4332 w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
4333 w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
4334 w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
4335 w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
4336 w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
4337 w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
4338 w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
4339 w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
4340 w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
4341 w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
4342 w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
4343
4344 w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
4345 w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
4346 w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
4347 w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
4348 w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
4349 w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
4350 w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
4351 w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
4352 w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
4353 w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
4354 w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
4355 w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
4356 w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
4357 w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
4358 w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
4359
4360 b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
4361 b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
4362 b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
4363 b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
4364 b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
4365 b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
4366
4367 b(0,1) =-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10];
4368 b(1,1) = a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6];
4369 b(2,1) =-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3];
4370 b(3,1) = a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1];
4371 b(4,1) =-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0];
4372 b(5,1) = a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0];
4373
4374 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4375 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4376 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4377 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4378 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4379 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4380 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4381 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4382 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4383 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4384 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4385 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4386 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4387 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4388 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
4389 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4390 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4391 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4392 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4393 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4394 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4395 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4396 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4397 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4398 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4399 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4400 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4401 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4402 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4403 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4404 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4405 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4406 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4407 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4408 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
4409 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4410 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4411 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4412 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4413 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4414 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4415 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4416 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4417 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4418 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4419 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4420 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4421 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4422 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4423 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4424
4425 b(0,2) = a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10];
4426 b(1,2) =-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6];
4427 b(2,2) = a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3];
4428 b(3,2) =-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1];
4429 b(4,2) = a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0];
4430 b(5,2) =-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0];
4431
4432 b(0,3) =-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10];
4433 b(1,3) = a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6];
4434 b(2,3) =-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3];
4435 b(3,3) = a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1];
4436 b(4,3) =-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0];
4437 b(5,3) = a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0];
4438
4439 w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
4440 w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
4441 w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
4442 w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
4443 w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
4444 w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
4445 w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
4446 w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
4447 w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
4448 w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
4449 w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
4450 w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
4451 w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
4452 w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
4453 w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
4454 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4455 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4456 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4457 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4458 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4459 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4460 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4461 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4462 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4463 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4464 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4465 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4466 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4467 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4468 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4469 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4470 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4471 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4472 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4473 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
4474
4475 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4476 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4477 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4478 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4479 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4480 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4481 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4482 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4483 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4484 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4485 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4486 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4487 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4488 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4489 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4490
4491 b(0,4) = a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10];
4492 b(1,4) =-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6];
4493 b(2,4) = a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3];
4494 b(3,4) =-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1];
4495 b(4,4) = a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0];
4496 b(5,4) =-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0];
4497
4498 b(0,5) =-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10];
4499 b(1,5) = a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6];
4500 b(2,5) =-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3];
4501 b(3,5) = a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1];
4502 b(4,5) =-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0];
4503 b(5,5) = a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0];
4504 }
4505 };
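// Usage sketch (illustrative, based on the Tiny::Matrix interface): the
// cofactor helpers are dispatched through Tiny::Matrix::set_cofactor():
//
//   Tiny::Matrix<double, 3, 3> a, cof; // a filled by the caller
//   cof.set_cofactor(a);
//   // for invertible a: cof(i,j) == a.det() * inv(a)(j,i)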
4506 } // namespace Intern
4508 } // namespace Tiny
4509} // namespace FEAT
#define XABORTM(msg)
Abortion macro definition with custom message.
Definition: assertion.hpp:192
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
Definition: assertion.hpp:230
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263
FEAT Kernel base header.
Math Limits class template.
Definition: math.hpp:1475
Tiny Matrix class template.
CUDA_HOST_DEVICE DataType det() const
Returns the determinant of the matrix.
CUDA_HOST_DEVICE Matrix(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE Matrix & set_cofactor(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the cofactor matrix of another matrix.
CUDA_HOST_DEVICE Matrix & add_scalar_main_diag(DataType alpha)
Adds a value onto the matrix's main diagonal.
CUDA_HOST_DEVICE Matrix & add_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y, const DataType alpha=DataType(1))
Adds the outer product of two vectors onto the matrix.
CUDA_HOST_DEVICE Matrix & add_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b, DataType alpha=DataType(1))
Adds the algebraic matrix-product of two other matrices onto this matrix.
CUDA_HOST_DEVICE Matrix & set_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b)
Sets this matrix to the algebraic matrix-product of two other matrices.
CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
Returns the Frobenius norm squared of the matrix.
T_ ValueType
the data type of the matrix
CUDA_HOST_DEVICE Matrix & operator=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
assignment operator
CUDA_HOST_DEVICE Matrix(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE Matrix & add_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a vector-tensor left-product onto this matrix.
CUDA_HOST_DEVICE Matrix & set_transpose(const Matrix< T_, n_, m_, sma_, sna_ > &a)
Sets this matrix to the transpose of another matrix.
CUDA_HOST_DEVICE Matrix & axpy(DataType alpha, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Adds another scaled matrix onto this matrix.
CUDA_HOST_DEVICE Matrix & operator-=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise subtraction operator
CUDA_HOST_DEVICE Matrix(const Matrix< T2_, m_, n_, sma_, sna_ > &a)
copy constructor
static CUDA_HOST_DEVICE Matrix null()
Returns a null-matrix.
CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
Returns the Frobenius norm of the difference of this matrix and the identity matrix.
CUDA_HOST_DEVICE Matrix & set_inverse(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the inverse of another matrix.
CUDA_HOST_DEVICE const RowType & operator[](int i) const
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator*=(DataType alpha)
scalar-right-multiply-by operator
CUDA_HOST_DEVICE DataType scalar_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y) const
Computes the scalar product of two vectors with this matrix.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
Vector< T_, n_, sn_ > RowType
the type of a single matrix row
CUDA_HOST_DEVICE const T_ & operator()(int i, int j) const
Access operator.
CUDA_HOST_DEVICE Matrix & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Matrix & set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
Sets this matrix to a 3D yaw-pitch-roll rotation matrix.
CUDA_HOST_DEVICE DataType trace() const
Returns the trace of the matrix.
CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
Returns the Hessian norm square.
CUDA_HOST_DEVICE Matrix(const std::initializer_list< Tx_ > &x)
Initializer list of Tiny::Vector constructor.
CUDA_HOST_DEVICE Matrix & set_gram(const Matrix< T_, l_, n_, sla_, sna_ > &a)
Sets this matrix to the Gram matrix of another matrix.
CUDA_HOST_DEVICE Matrix & set_rotation_2d(T_ angle)
Sets this matrix to a 2D rotation matrix.
CUDA_HOST_DEVICE Matrix & set_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, T_ alpha=T_(1))
Sets this matrix to the algebraic matrix double-product of three other matrices.
CUDA_HOST_DEVICE T_ & operator()(int i, int j)
Access operator.
CUDA_HOST_DEVICE Matrix & add_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the algebraic matrix double-product of three other matrices onto this matrix.
CUDA_HOST_DEVICE void convert(const Matrix< Tx_, m_, n_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Matrix & set_identity()
Sets this matrix to the identity matrix.
CUDA_HOST_DEVICE Matrix & operator+=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise addition operator
CUDA_HOST_DEVICE RowType & operator[](int i)
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix()
default constructor
CUDA_HOST_DEVICE void copy_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Copies the upper left mm_ x nn_ entries of a matrix.
CUDA_HOST_DEVICE DataType vol() const
Returns the volume of the matrix.
RowType v[sm_]
actual matrix data; that's an array of vectors
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix & set_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Sets this matrix to the result of a vector-tensor left-product.
CUDA_HOST_DEVICE Matrix & set_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y)
Sets this matrix to the outer product of two vectors.
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the matrix.
CUDA_HOST_DEVICE void copy(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Copies a matrix.
CUDA_HOST_DEVICE DataType norm_frobenius() const
Returns the Frobenius norm of the matrix.
Tiny Tensor3 class template.
CUDA_HOST_DEVICE Tensor3 & add_double_mat_mult(const Tensor3< T_, lt_, mt_, nt_, slt_, smt_, snt_ > &t, const Matrix< T_, nt_, n_, smb_, snb_ > &b, const Matrix< T_, mt_, m_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the result of a matrix-tensor-matrix double-product onto this tensor.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE void copy_n(const Tensor3< T_, la_, ma_, na_, sla_, sma_, sna_ > &a)
Copies the upper left mm_ x nn_ entries of a matrix.
static constexpr int l
the tube count of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
static constexpr int sn
the column stride of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-assignment operator
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
formats the tensor
CUDA_HOST_DEVICE const T_ & operator()(int h, int i, int j) const
Access operator.
CUDA_HOST_DEVICE Tensor3 & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Tensor3 & add_vec_mat_outer_product(const Vector< T_, l_, slx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix outer product onto this tensor.
CUDA_HOST_DEVICE Tensor3 & operator+=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise addition operator
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE const PlaneType & operator[](int h) const
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-constructor
Matrix< T_, m_, n_, sm_, sn_ > PlaneType
Type of a single tensor plane; the tensor data is an array of matrices.
static constexpr int sl
the tube stride of the tensor
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
CUDA_HOST_DEVICE PlaneType & operator[](int h)
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(DataType value)
value-assignment constructor
static CUDA_HOST_DEVICE Tensor3 null()
Returns a null-tensor.
CUDA_HOST_DEVICE Tensor3 & add_mat_tensor_mult(const Matrix< T_, l_, k_, sma_, sna_ > &a, const Tensor3< T_, k_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a matrix-tensor product onto this tensor.
PlaneType v[sl_]
Actual tensor data.
static constexpr int m
the row count of the tensor
CUDA_HOST_DEVICE void convert(const Tensor3< Tx_, l_, m_, n_, sla_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Tensor3()
default constructor
CUDA_HOST_DEVICE Tensor3 & operator-=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise subtraction operator
CUDA_HOST_DEVICE T_ & operator()(int h, int i, int j)
Access operator.
static constexpr int n
the column count of the tensor
static constexpr int sm
the row stride of the tensor
T_ ValueType
the data type of the tensor
CUDA_HOST_DEVICE Tensor3 & operator*=(DataType alpha)
scalar right-multiply-by operator
CUDA_HOST_DEVICE void copy(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
Copies a Tensor3.
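The Tensor3 interface mirrors the matrix one, plane by plane. A minimal sketch (hypothetical sizes; per the plane-access operator above, t[h] yields the h-th plane as a Tiny::Matrix):

void tensor_demo()
{
  // value construction fills every entry
  Tiny::Tensor3<double, 2, 2, 2> t(0.0);

  // (h,i,j) element access and plane access
  t(0, 1, 1) = 2.0;
  Tiny::Matrix<double, 2, 2>& plane = t[0]; // PlaneType is a Tiny::Matrix
  plane(1, 0) = -1.0;

  // add a scaled vector-matrix outer product onto the tensor
  Tiny::Vector<double, 2> x(1.0);
  Tiny::Matrix<double, 2, 2> a;
  a.format(1.0); // fill with ones
  t.add_vec_mat_outer_product(x, a, 0.5);

  // scale all entries in place
  t *= 2.0;
}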
Tiny Vector class template.
static CUDA_HOST_DEVICE Vector null()
Returns a null-vector.
CUDA_HOST_DEVICE DataType norm_euclid_n() const
Computes the Euclidean norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
Computes the squared Euclidean norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & normalize_n()
Normalizes the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & operator=(const Vector< T_, n_, sx_ > &x)
copy-assignment operator
T_ v[s_]
actual vector data
CUDA_HOST_DEVICE bool normalized() const
Returns whether the vector is normalized.
CUDA_HOST_DEVICE void format_n(DataType alpha=DataType(0))
Formats the first nn_ entries of the vector.
static CUDA_HOST_DEVICE Vector convert_new(Vector &&x)
overload for moveable rvalue type
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the vector.
CUDA_HOST_DEVICE Vector(const Vector< T_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE DataType norm_max() const
Computes the max-norm of this vector.
CUDA_HOST_DEVICE T_ & operator()(int i)
Access operator.
CUDA_HOST_DEVICE const T_ & operator[](int i) const
Access operator.
CUDA_HOST_DEVICE Vector & operator-=(const Vector< T_, n_, sx_ > &x)
vector-subtract operator
CUDA_HOST_DEVICE Vector & set_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this vector to the result of a vector-matrix product.
static CUDA_HOST_DEVICE Vector convert_new(const Vector< Tx_, n_, sx_ > &x)
Conversion function; not callable with a non-convertible inner type.
CUDA_HOST_DEVICE Vector & add_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE void scale_n(DataType alpha)
Scales the first nn_ entries of the vector.
CUDA_HOST_DEVICE Vector & set_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x)
Sets the first nn_ entries of this vector to the result of a matrix-vector product with the first mm_ x nn_ entries of a matrix.
static constexpr int n
the length of the vector
CUDA_HOST_DEVICE Vector & operator*=(DataType alpha)
scalar-multiply operator
CUDA_HOST_DEVICE Vector()
default constructor
CUDA_HOST_DEVICE DataType norm_l1() const
Computes the l1-norm of this vector.
CUDA_HOST_DEVICE Vector & set_convex(DataType alpha, const Vector< T_, n_, sna_ > &a, const Vector< T_, n_, snb_ > &b)
Sets this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE Vector & axpy_n(DataType alpha, const Vector< T_, nx_, snx_ > &x)
Adds the first nn_ entries of another scaled vector onto this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST friend std::ostream & operator<<(std::ostream &lhs, const Vector &b)
Tiny::Vector streaming operator.
CUDA_HOST_DEVICE Vector & operator+=(const Vector< T_, n_, sx_ > &x)
vector-add operator
CUDA_HOST_DEVICE Vector & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE const T_ & operator()(int i) const
Access operator.
CUDA_HOST_DEVICE Vector & axpy(DataType alpha, const Vector< T_, n_, snx_ > &x)
Adds another scaled vector onto this vector.
CUDA_HOST_DEVICE void copy(const Vector< T_, n_, snx_ > &x)
Copies a vector.
CUDA_HOST_DEVICE Vector & normalize()
Normalizes this vector.
CUDA_HOST_DEVICE Vector & negate()
Negates the vector, i.e. effectively multiplies all components by -1.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE Vector(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE DataType norm_max_n() const
Computes the max-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & negate_n()
Negates the first nn_ entries of this vector.
CUDA_HOST_DEVICE DataType norm_l1_n() const
Computes the l1-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & set_convex_n(DataType alpha, const Vector< T_, na_, sna_ > &a, const Vector< T_, nb_, snb_ > &b)
Sets the first nn_ entries of this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
Computes the squared Euclidean norm of this vector.
CUDA_HOST_DEVICE void copy_n(const Vector< T_, nx_, snx_ > &x)
Copies the first nn_ entries of a vector.
CUDA_HOST_DEVICE void scale(DataType alpha)
Scales the vector.
CUDA_HOST_DEVICE Vector(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
static constexpr int s
the stride of the vector
CUDA_HOST_DEVICE Vector & set_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x)
Sets this vector to the result of a matrix-vector product.
CUDA_HOST_DEVICE Vector & add_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE T_ & operator[](int i)
Access operator.
T_ ValueType
the value type of the vector
CUDA_HOST_DEVICE DataType norm_euclid() const
Computes the Euclidean norm of this vector.
CUDA_HOST_DEVICE Vector & set_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Sets the first mm_ entries of this vector to the result of a vector-matrix product with the first mm_ x nn_ entries of a matrix.
CUDA_HOST_DEVICE void convert(const Vector< Tx_, n_, sx_ > &x)
conversion operator
CUDA_HOST_DEVICE Vector & operator*=(const Vector< T_, n_, sx_ > &x)
element-wise-multiply operator
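Taken together, the vector members support the usual small-vector workflow. A minimal sketch (hypothetical length 3; note that axpy takes the scaling factor first, as in the signature above):

void vector_demo()
{
  // initializer-list and value construction
  Tiny::Vector<double, 3> x{1.0, 2.0, 3.0};
  Tiny::Vector<double, 3> y(1.0);

  // x <- x + 0.5*y, then normalize x to unit Euclidean length
  x.axpy(0.5, y);
  x.normalize();

  // the three norms
  double n2 = x.norm_euclid();
  double n1 = y.norm_l1();
  double nm = y.norm_max();

  // z <- a*x with an identity matrix
  Tiny::Matrix<double, 3, 3> a;
  a.set_identity();
  Tiny::Vector<double, 3> z;
  z.set_mat_vec_mult(a, x);
  (void)n2; (void)n1; (void)nm;
}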
T_ sqrt(T_ x)
Returns the square-root of a value.
Definition: math.hpp:300
T_ abs(T_ x)
Returns the absolute value.
Definition: math.hpp:275
DT_ calc_opening_angle(DT_ x1, DT_ x2, DT_ y1, DT_ y2)
Calculates the opening angle of two 2D vectors.
Definition: math.hpp:1450
T_ pow(T_ x, T_ y)
Returns x raised to the power of y.
Definition: math.hpp:643
T_ sin(T_ x)
Returns the sine of a value.
Definition: math.hpp:344
T_ sqr(T_ x)
Returns the square of a value.
Definition: math.hpp:95
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123
T_ cos(T_ x)
Returns the cosine of a value.
Definition: math.hpp:386
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
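The scalar helpers above come from kernel/util/math.hpp (included at the top of this header) and provide host/device-safe replacements for the std functions. A minimal sketch, assuming they live in the FEAT::Math namespace:

#include <kernel/util/math.hpp>

using namespace FEAT;

double scalar_demo(double x, double y)
{
  // clamp x into [0,1] with min/max, then combine sqr, sqrt and pow
  double t = Math::max(0.0, Math::min(x, 1.0));
  double r = Math::sqrt(Math::sqr(t)) + Math::pow(t, y);

  // sin^2 + cos^2 deviates from 1 only by rounding; abs guards the sign
  double err = Math::abs(Math::sqr(Math::sin(t)) + Math::sqr(Math::cos(t)) - 1.0);
  return r + err;
}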
CUDA_HOST_DEVICE Vector< T_, n_ > operator+(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector addition operator
CUDA_HOST Vector< T_, dim_ > project_onto(const Vector< T_, dim_ > &x, const Vector< T_, dim_ > &y)
Calculates the projection of the vector x onto the vector y.
CUDA_HOST_DEVICE T_ dot(const T_ &a, const T_ &b)
Computes the dot-product of two scalars.
CUDA_HOST_DEVICE void orthogonal_3x2(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 3x2 matrix.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY operation on two scalars, i.e. y += alpha * x.
CUDA_HOST_DEVICE void orthogonal_2x1(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 2x1 matrix.
CUDA_HOST_DEVICE void add_id(T_ &x, const T_ &alpha)
Adds a scaled identity onto a scalar.
Vector< T_, m_ > orthogonal(const Matrix< T_, m_, m_-1, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of an m_ x (m_-1) matrix.
CUDA_HOST_DEVICE Vector< T_, n_ > operator-(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector subtraction operator
CUDA_HOST_DEVICE Vector< T_, n_ > component_product(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector element-wise-product operator
CUDA_HOST_DEVICE Vector< T_, n_ > operator*(typename Vector< T_, n_ >::DataType alpha, const Vector< T_, n_, s_ > &x)
scalar-left-multiplication operator
CUDA_HOST T_ calculate_opening_angle(const Vector< T_, 2 > &x, const Vector< T_, 2 > &y)
Calculates the counter-clockwise opening angle between two 2D vectors.
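The free operators and functions above allow short expressions without member calls. A minimal sketch (hypothetical; component_product is resolved via argument-dependent lookup on the Tiny::Vector arguments):

void operator_demo()
{
  Tiny::Vector<double, 2> a{1.0, 0.0};
  Tiny::Vector<double, 2> b{0.0, 1.0};

  // component-wise addition/subtraction and scalar left-multiplication
  Tiny::Vector<double, 2> c = a + b;
  Tiny::Vector<double, 2> d = 2.0 * (a - b);

  // element-wise product, found through ADL
  Tiny::Vector<double, 2> e = component_product(c, d);
  (void)e;
}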
FEAT namespace.
Definition: adjactor.hpp:12
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:944