FEAT 3
Finite Element Analysis Toolbox
Loading...
Searching...
No Matches
tiny_algebra.hpp
1// FEAT3: Finite Element Analysis Toolbox, Version 3
2// Copyright (C) 2010 by Stefan Turek & the FEAT group
3// FEAT3 is released under the GNU General Public License version 3,
4// see the file 'copyright.txt' in the top level directory for details.
5
6#pragma once
7
8#include <iostream>
9
10#include <ios>
13#include <limits>
14#ifndef __CUDA_ARCH__
15#include <kernel/util/math.hpp>
16#endif
17
18// includes, system
19#ifndef __CUDACC__
20#include <initializer_list>
21#else
22#include <kernel/util/cuda_math.cuh>
23#endif
24
25namespace FEAT
26{
33 namespace Tiny
34 {
52 template<
53 typename T_,
54 int n_,
55 int s_ = n_>
56 class Vector DOXY({});
57
78 template<
79 typename T_,
80 int m_,
81 int n_,
82 int sm_ = m_,
83 int sn_ = n_>
84 class Matrix DOXY({});
85
112 template<
113 typename T_,
114 int l_,
115 int m_,
116 int n_,
117 int sl_ = l_,
118 int sm_ = m_,
119 int sn_ = n_>
120 class Tensor3 DOXY({});
121
123 namespace Intern
124 {
133 template<typename T_>
134 struct DataTypeExtractor
135 {
137 typedef T_ MyDataType;
139 static constexpr int level = 0;
140 };
141
156 template<typename T_, int n_, int s_>
157 struct DataTypeExtractor<Vector<T_, n_, s_>>
158 {
160 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
162 static constexpr int level = DataTypeExtractor<T_>::level+1;
163 };
164
165 // Same for Matrix
166 template<typename T_, int m_, int n_, int sm_, int sn_>
167 struct DataTypeExtractor<Matrix<T_, m_, n_, sm_, sn_>>
168 {
170 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
172 static constexpr int level = DataTypeExtractor<T_>::level+1;
173 };
174
175 // Same for Tensor3
176 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
177 struct DataTypeExtractor<Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>>
178 {
180 typedef typename DataTypeExtractor<T_>::MyDataType MyDataType;
182 static constexpr int level = DataTypeExtractor<T_>::level+1;
183 };
184
185 // forward declarations of helper classes
186 template<int m_, int n_>
187 struct DetHelper;
188
189 template<int m_, int n_>
190 struct VolHelper;
191
192 template<int m_, int n_>
193 struct InverseHelper;
194
195 template<int m_, int n_>
196 struct CofactorHelper;
197
198#ifdef __CUDACC__
199 template<int m_, int n_>
200 struct CudaGroupedInverseHelper;
201#endif
202 } // namespace Intern
204
205 /* ************************************************************************************************************* */
206 /* ************************************************************************************************************* */
207 // Tiny Vector implementation
208 /* ************************************************************************************************************* */
209 /* ************************************************************************************************************* */
210
211 template<typename T_, int n_, int s_>
212 class Vector
213 {
214 static_assert(n_ > 0, "invalid vector length");
215 static_assert(s_ >= n_, "invalid vector stride");
216
217 public:
219 static constexpr int n = n_;
221 static constexpr int s = s_;
222
224 typedef T_ ValueType;
226 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
227
229 T_ v[s_];
230
232 CUDA_HOST_DEVICE Vector()
233 {
234 }
235
237 CUDA_HOST_DEVICE explicit Vector(DataType value)
238 {
239 for(int i(0); i < n_; ++i)
240 {
241 v[i] = value;
242 }
243 }
244
246 template<int sx_>
247 CUDA_HOST_DEVICE Vector(const Vector<T_, n_, sx_>& x)
248 {
249 for(int i(0); i < n_; ++i)
250 {
251 v[i] = x.v[i];
252 }
253 }
254
256 template<typename Tx_, int sx_>
257 CUDA_HOST_DEVICE explicit Vector(const Vector<Tx_, n_, sx_>& x)
258 {
259 for(int i(0); i < n_; ++i)
260 {
261 v[i] = ValueType(x.v[i]);
262 }
263 }
264
266 template<typename Tx_>
267 CUDA_HOST_DEVICE explicit Vector(const std::array<Tx_, n_>& x)
268 {
269 for(int i(0); i < n_; ++i)
270 {
271 v[i] = ValueType(x[i]);
272 }
273 }
274
276 template<typename Tx_, int sx_>
277 CUDA_HOST_DEVICE static Vector convert_new(const Vector<Tx_, n_, sx_>& x)
278 {
279 Vector v;
280 for(int i(0); i < n_; ++i)
281 {
282 if constexpr(std::is_same<T_, DataType>::value)
283 v[i] = T_(x.v[i]);
284 else
285 v[i] = T_::convert(x.v[i]);
286 }
287 return v;
288 }
289
291 CUDA_HOST_DEVICE static Vector convert_new(Vector&& x)
292 {
293 return std::move(x);
294 }
295
308 template<typename Tx_>
309 CUDA_HOST_DEVICE Vector(const std::initializer_list<Tx_>& x)
310 {
311 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
312 auto it(x.begin());
313 for(int i(0); i < n_; ++i, ++it)
314 v[i] = T_(*it);
315 }
316
318 CUDA_HOST_DEVICE Vector& operator=(DataType value)
319 {
320 for(int i(0); i < n_; ++i)
321 {
322 v[i] = value;
323 }
324 return *this;
325 }
326
328 template<int sx_>
329 CUDA_HOST_DEVICE Vector& operator=(const Vector<T_, n_, sx_>& x)
330 {
331 for(int i(0); i < n_; ++i)
332 {
333 v[i] = x.v[i];
334 }
335 return *this;
336 }
337
339 template<typename Tx_, int sx_>
340 CUDA_HOST_DEVICE Vector& operator=(const Vector<Tx_, n_, sx_>& x)
341 {
342 for(int i(0); i < n_; ++i)
343 {
344 v[i] = ValueType(x.v[i]);
345 }
346 return *this;
347 }
348
350 template<typename Tx_>
351 CUDA_HOST_DEVICE Vector& operator=(const std::array<Tx_, n_>& x)
352 {
353 for(int i(0); i < n_; ++i)
354 {
355 v[i] = ValueType(x[std::size_t(i)]);
356 }
357 return *this;
358 }
359
374 template<typename Tx_>
375 CUDA_HOST_DEVICE Vector& operator=(const std::initializer_list<Tx_>& x)
376 {
377 XASSERTM(std::size_t(n_) == x.size(), "invalid initializer list size");
378 auto it(x.begin());
379 for(int i(0); i < n_; ++i, ++it)
380 v[i] = T_(*it);
381 return *this;
382 }
383
385 template<typename Tx_, int sx_>
386 CUDA_HOST_DEVICE void convert(const Vector<Tx_, n_, sx_>& x)
387 {
388 for(int i(0); i < n_; ++i)
389 v[i] = T_(x.v[i]);
390 }
391
400 CUDA_HOST_DEVICE T_& operator()(int i)
401 {
402 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
403 return v[i];
404 }
405
407 CUDA_HOST_DEVICE const T_& operator()(int i) const
408 {
409 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
410 return v[i];
411 }
412
414 CUDA_HOST_DEVICE T_& operator[](int i)
415 {
416 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
417 return v[i];
418 }
419
421 CUDA_HOST_DEVICE const T_& operator[](int i) const
422 {
423 ASSERTM((i >= 0) && (i < n_), "index i out-of-bounds");
424 return v[i];
425 }
426
433 template<int snx_>
434 CUDA_HOST_DEVICE void copy(const Vector<T_, n_, snx_>& x)
435 {
436 for(int i(0); i < n_; ++i)
437 {
438 v[i] = x.v[i];
439 }
440 }
441
451 template<int nn_, int nx_, int snx_>
452 CUDA_HOST_DEVICE void copy_n(const Vector<T_, nx_, snx_>& x)
453 {
454 static_assert(nn_ <= n_, "invalid copy_n size");
455 static_assert(nn_ <= nx_, "invalid copy_n size");
456 for(int i(0); i < nn_; ++i)
457 {
458 v[i] = x.v[i];
459 }
460 }
461
463 CUDA_HOST_DEVICE Vector& operator*=(DataType alpha)
464 {
465 for(int i(0); i < n_; ++i)
466 {
467 v[i] *= alpha;
468 }
469 return *this;
470 }
471
473 template <int sx_>
474 CUDA_HOST_DEVICE Vector& operator*=(const Vector<T_, n_, sx_>& x)
475 {
476 for(int i(0); i < n_; ++i)
477 {
478 v[i] *= x.v[i];
479 }
480 return *this;
481 }
482
484 template<int sx_>
485 CUDA_HOST_DEVICE Vector& operator+=(const Vector<T_, n_, sx_>& x)
486 {
487 for(int i(0); i < n_; ++i)
488 {
489 v[i] += x.v[i];
490 }
491 return *this;
492 }
493
495 template<int sx_>
496 CUDA_HOST_DEVICE Vector& operator-=(const Vector<T_, n_, sx_>& x)
497 {
498 for(int i(0); i < n_; ++i)
499 {
500 v[i] -= x.v[i];
501 }
502 return *this;
503 }
504
511 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
512 {
513 for(int i(0); i < n_; ++i)
514 {
515 v[i] = alpha;
516 }
517 }
518
528 template<int nn_>
529 CUDA_HOST_DEVICE void format_n(DataType alpha = DataType(0))
530 {
531 static_assert(nn_ <= n_, "invalid format_n size");
532 for(int i(0); i < nn_; ++i)
533 {
534 v[i] = alpha;
535 }
536 }
537
544 CUDA_HOST_DEVICE void scale(DataType alpha)
545 {
546 for(int i(0); i < n_; ++i)
547 {
548 v[i] *= alpha;
549 }
550 }
551
561 template<int nn_>
562 CUDA_HOST_DEVICE void scale_n(DataType alpha)
563 {
564 static_assert(nn_ <= n_, "invalid scale_n size");
565 for(int i(0); i < nn_; ++i)
566 {
567 v[i] *= alpha;
568 }
569 }
570
576 CUDA_HOST_DEVICE Vector& normalize()
577 {
578 #ifndef __CUDACC__
579 const DataType norm2(this->norm_euclid());
580 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
581 return ((*this) *= (DataType(1)/norm2));
582 #else
583 const DataType norm2_sqr(this->norm_euclid_sqr());
584 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
585 return ((*this) *= CudaMath::cuda_rsqrt(norm2_sqr));
586 #endif
587 }
588
597 template<int nn_>
598 CUDA_HOST_DEVICE Vector& normalize_n()
599 {
600 static_assert(nn_ <= n_, "invalid normalize_n size");
601#ifndef __CUDACC__
602 const DataType norm2(this->template norm_euclid_n<nn_>());
603 ASSERTM(norm2 > Math::eps<DataType>(), "Trying to normalize a null vector!");
604 this->template scale_n<nn_>(DataType(1)/norm2);
605 return *this;
606#else
607 const DataType norm2_sqr(this->template norm_euclid_sqr_n<nn_>());
608 ASSERTM(norm2_sqr > CudaMath::cuda_get_eps<DataType>(), "Trying to normalize a null vector!");
609 this->template scale_n<nn_>(CudaMath::cuda_rsqrt(norm2_sqr));
610 return *this;
611#endif
612 }
613
619 CUDA_HOST_DEVICE Vector& negate()
620 {
621 for(int i(0); i < n_; ++i)
622 v[i] = -v[i];
623 return *this;
624 }
625
634 template<int nn_>
635 CUDA_HOST_DEVICE Vector& negate_n()
636 {
637 static_assert(nn_ <= n_, "invalid negate_n size");
638 for(int i(0); i < nn_; ++i)
639 v[i] = -v[i];
640 return *this;
641 }
642
654 template<int snx_>
655 CUDA_HOST_DEVICE Vector& axpy(DataType alpha, const Vector<T_, n_, snx_>& x)
656 {
657 for(int i(0); i < n_; ++i)
658 v[i] += alpha * x.v[i];
659 return *this;
660 }
661
676 template<int nn_, int nx_, int snx_>
677 CUDA_HOST_DEVICE Vector& axpy_n(DataType alpha, const Vector<T_, nx_, snx_>& x)
678 {
679 static_assert(nn_ <= n_, "invalid negate_n size");
680 static_assert(nn_ <= nx_, "invalid negate_n size");
681 for(int i(0); i < nn_; ++i)
682 v[i] += alpha * x.v[i];
683 return *this;
684 }
685
703 template<int sna_, int snb_>
704 CUDA_HOST_DEVICE Vector& set_convex(DataType alpha, const Vector<T_, n_, sna_>& a, const Vector<T_, n_, snb_>& b)
705 {
706 for(int i(0); i < n_; ++i)
707 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
708 return *this;
709 }
710
731 template<int nn_, int na_, int nb_, int sna_, int snb_>
732 CUDA_HOST_DEVICE Vector& set_convex_n(DataType alpha, const Vector<T_, na_, sna_>& a, const Vector<T_, nb_, snb_>& b)
733 {
734 static_assert(nn_ <= n_, "invalid set_convex_n size");
735 static_assert(nn_ <= na_, "invalid set_convex_n size");
736 static_assert(nn_ <= nb_, "invalid set_convex_n size");
737 for(int i(0); i < nn_; ++i)
738 v[i] = (T_(1) - alpha) * a.v[i] + alpha * b.v[i];
739 return *this;
740 }
741
757 template<int m_, int sma_, int sna_, int sx_>
759 {
760 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
761 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
762
763 for(int i(0); i < n_; ++i)
764 {
765 v[i] = T_(0);
766 for(int j(0); j < m_; ++j)
767 {
768 v[i] += a.v[i][j] * x.v[j];
769 }
770 }
771 return *this;
772 }
773
793 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
795 {
796 static_assert(mm_ <= n_, "invalid set_mat_vec_mult_n size");
797 static_assert(mm_ <= ma_, "invalid set_mat_vec_mult_n size");
798 static_assert(nn_ <= nx_, "invalid set_mat_vec_mult_n size");
799 static_assert(nn_ <= na_, "invalid set_mat_vec_mult_n size");
800
801 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
802 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
803
804 for(int i(0); i < mm_; ++i)
805 {
806 v[i] = T_(0);
807 for(int j(0); j < nn_; ++j)
808 {
809 v[i] += a.v[i][j] * x.v[j];
810 }
811 }
812 return *this;
813 }
814
830 template<int m_, int sma_, int sna_, int sx_>
832 {
833 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
834 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
835
836 for(int j(0); j < n_; ++j)
837 {
838 v[j] = T_(0);
839 for(int i(0); i < m_; ++i)
840 {
841 v[j] += a.v[i][j] * x.v[i];
842 }
843 }
844 return *this;
845 }
846
866 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
868 {
869 static_assert(mm_ <= mx_, "invalid set_mat_vec_mult_n size");
870 static_assert(mm_ <= ma_, "invalid set_mat_vec_mult_n size");
871 static_assert(nn_ <= n_, "invalid set_mat_vec_mult_n size");
872 static_assert(nn_ <= na_, "invalid set_mat_vec_mult_n size");
873
874 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
875 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
876
877 for(int j(0); j < nn_; ++j)
878 {
879 v[j] = T_(0);
880 for(int i(0); i < mm_; ++i)
881 {
882 v[j] += a.v[i][j] * x.v[i];
883 }
884 }
885 return *this;
886 }
887
906 template<int m_, int sma_, int sna_, int sx_>
908 {
909 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
910 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
911
912 for(int i(0); i < n_; ++i)
913 {
914 for(int j(0); j < m_; ++j)
915 {
916 v[i] += alpha * a.v[i][j] * x.v[j];
917 }
918 }
919 return *this;
920 }
921
946 template<int mm_, int nn_, int ma_, int na_, int sna_, int sma_, int nx_, int sx_>
948 {
949 static_assert(mm_ <= n_, "invalid add_mat_vec_mult_n size");
950 static_assert(mm_ <= ma_, "invalid add_mat_vec_mult_n size");
951 static_assert(nn_ <= nx_, "invalid add_mat_vec_mult_n size");
952 static_assert(nn_ <= na_, "invalid add_mat_vec_mult_n size");
953
954 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
955 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
956
957 for(int i(0); i < mm_; ++i)
958 {
959 for(int j(0); j < nn_; ++j)
960 {
961 v[i] += alpha * a.v[i][j] * x.v[j];
962 }
963 }
964 return *this;
965 }
966
985 template<int m_, int sma_, int sna_, int sx_>
987 {
988 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
989 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
990
991 for(int j(0); j < n_; ++j)
992 {
993 for(int i(0); i < m_; ++i)
994 {
995 v[j] += alpha * a.v[i][j] * x.v[i];
996 }
997 }
998 return *this;
999 }
1000
1025 template<int nn_, int mm_, int mx_, int smx_, int ma_, int na_, int sma_, int sna_>
1027 {
1028 static_assert(mm_ <= mx_, "invalid add_vec_mat_mult_n size");
1029 static_assert(mm_ <= ma_, "invalid add_vec_mat_mult_n size");
1030 static_assert(nn_ <= n_, "invalid add_vec_mat_mult_n size");
1031 static_assert(nn_ <= na_, "invalid add_vec_mat_mult_n size");
1032
1033 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1034 ASSERTM((const void*)this != (const void*)&x, "result vector and multiplicand vector 'x' must be different objects");
1035
1036 for(int j(0); j < nn_; ++j)
1037 {
1038 for(int i(0); i < mm_; ++i)
1039 {
1040 v[j] += alpha * a.v[i][j] * x.v[i];
1041 }
1042 }
1043 return *this;
1044 }
1045
1052 CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
1053 {
1054 DataType r(DataType(0));
1055 for(int i(0); i < n_; ++i)
1056 {
1057 #ifndef __CUDACC__
1058 r += Math::sqr(v[i]);
1059 #else
1060 r += CudaMath::cuda_sqr(v[i]);
1061 #endif
1062 }
1063 return r;
1064 }
1065
1072 template<int nn_>
1073 CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
1074 {
1075 static_assert(nn_ <= n_, "invalid norm_euclid_sqr_n size");
1076 DataType r(DataType(0));
1077 for(int i(0); i < nn_; ++i)
1078 {
1079 #ifndef __CUDACC__
1080 r += Math::sqr(v[i]);
1081 #else
1082 r += CudaMath::cuda_sqr(v[i]);
1083 #endif
1084 }
1085 return r;
1086 }
1087
1094 CUDA_HOST_DEVICE DataType norm_euclid() const
1095 {
1096 #ifndef __CUDACC__
1097 return Math::sqrt(norm_euclid_sqr());
1098 #else
1099 return CudaMath::cuda_sqrt(norm_euclid_sqr());
1100 #endif
1101 }
1102
1109 template<int nn_>
1110 CUDA_HOST_DEVICE DataType norm_euclid_n() const
1111 {
1112 static_assert(nn_ <= n_, "invalid norm_euclid_n size");
1113 #ifndef __CUDACC__
1114 return Math::sqrt(this->template norm_euclid_sqr_n<nn_>());
1115 #else
1116 return CudaMath::cuda_sqrt(this->template norm_euclid_sqr_n<nn_>());
1117 #endif
1118 }
1119
1126 CUDA_HOST_DEVICE DataType norm_l1() const
1127 {
1128 DataType r(DataType(0));
1129 for(int i(0); i < n_; ++i)
1130 {
1131 #ifndef __CUDACC__
1132 r += Math::abs(v[i]);
1133 #else
1134 r += CudaMath::cuda_abs(v[i]);
1135 #endif
1136 }
1137 return r;
1138 }
1139
1146 template<int nn_>
1147 CUDA_HOST_DEVICE DataType norm_l1_n() const
1148 {
1149 static_assert(nn_ <= n_, "invalid norm_l1_n size");
1150 DataType r(DataType(0));
1151 for(int i(0); i < nn_; ++i)
1152 {
1153 #ifndef __CUDACC__
1154 r += Math::abs(v[i]);
1155 #else
1156 r += CudaMath::cuda_abs(v[i]);
1157 #endif
1158 }
1159 return r;
1160 }
1161
1168 CUDA_HOST_DEVICE DataType norm_max() const
1169 {
1170 DataType r(DataType(0));
1171 for(int i(0); i < n_; ++i)
1172 {
1173 #ifndef __CUDACC__
1174 r = Math::max(r, Math::abs(v[i]));
1175 #else
1176 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1177 #endif
1178 }
1179 return r;
1180 }
1181
1188 template<int nn_>
1189 CUDA_HOST_DEVICE DataType norm_max_n() const
1190 {
1191 static_assert(nn_ <= n_, "invalid norm_l1_n size");
1192 DataType r(DataType(0));
1193 for(int i(0); i < nn_; ++i)
1194 {
1195 #ifndef __CUDACC__
1196 r = Math::max(r, Math::abs(v[i]));
1197 #else
1198 r = CudaMath::cuda_max(r, CudaMath::cuda_abs(v[i]));
1199 #endif
1200 }
1201 return r;
1202 }
1203
1207 CUDA_HOST_DEVICE static Vector null()
1208 {
1209 return Vector(DataType(0));
1210 }
1211
1215 CUDA_HOST_DEVICE bool normalized() const
1216 {
1218 }
1219
1226 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Vector & b)
1227 {
1228 lhs << "[";
1229 for (int i(0) ; i < b.n ; ++i)
1230 {
1231 lhs << " " << stringify(b(i));
1232 }
1233 lhs << " ]";
1234
1235 return lhs;
1236 }
1237
1238 CUDA_HOST friend std::istream& operator>>(std::istream& in, Vector& vector)
1239 {
1240 // Ignore all input until opening bracket
1241 in.ignore(std::numeric_limits<std::streamsize>::max(), '[');
1242
1243 for(int i(0); i < Vector::n; ++i)
1244 {
1245 in >> vector(i);
1246 }
1247
1248 // Ignore all input until closing bracket is consumed
1249 in.ignore(std::numeric_limits<std::streamsize>::max(), ']');
1250
1251 return in;
1252 }
1253 }; // class Vector
1254
1255 template<typename T_, int sx_, int sa_>
1256 inline void cross(Vector<T_, 2, sx_>& x, const Vector<T_, 2, sa_>& a)
1257 {
1258 x.v[0] = a.v[1];
1259 x.v[1] = -a.v[0];
1260 }
1261
1262 template<typename T_, int sx_, int sa_, int sb_>
1263 inline void cross(Vector<T_, 3, sx_>& x, const Vector<T_, 3, sa_>& a, const Vector<T_, 3, sb_>& b)
1264 {
1265 x.v[0] = a.v[1]*b.v[2] - a.v[2]*b.v[1];
1266 x.v[1] = a.v[2]*b.v[0] - a.v[0]*b.v[2];
1267 x.v[2] = a.v[0]*b.v[1] - a.v[1]*b.v[0];
1268 }
1269
1271 template<typename T_, int n_, int s_>
1272 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(typename Vector<T_, n_>::DataType alpha, const Vector<T_, n_, s_>& x)
1273 {
1274 return Vector<T_, n_>(x) *= alpha;
1275 }
1276
1278 template<typename T_, int n_, int s_>
1279 CUDA_HOST_DEVICE inline Vector<T_, n_> operator*(const Vector<T_, n_, s_>& x, typename Vector<T_, n_>::DataType alpha)
1280 {
1281 return Vector<T_, n_>(x) *= alpha;
1282 }
1283
1285 template<typename T_, int n_, int sa_, int sb_>
1287 {
1288 return Vector<T_, n_>(a) *= b;
1289 }
1290
1292 template<typename T_, int n_, int sa_, int sb_>
1293 CUDA_HOST_DEVICE inline Vector<T_, n_> operator+(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1294 {
1295 return Vector<T_, n_>(a) += b;
1296 }
1297
1299 template<typename T_, int n_, int sa_, int sb_>
1300 CUDA_HOST_DEVICE inline Vector<T_, n_> operator-(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
1301 {
1302 return Vector<T_, n_>(a) -= b;
1303 }
1304
1311 template<typename T_>
1312 CUDA_HOST inline T_ calculate_opening_angle(const Vector<T_,2>& x, const Vector<T_, 2>& y)
1313 {
1314 #ifdef __CUDACC__
1315 XABORTM("calculate_opening_angle not implemented for CUDA");
1316 return T_(0);
1317 #else
1318 return Math::calc_opening_angle(x[0], x[1], y[0], y[1]);
1319 #endif
1320 }
1321
1330 template<typename T_, int dim_>
1332 {
1333 T_ norm2(y.template norm_euclid_n<dim_>());
1334 if(norm2 < Math::eps<T_>())
1335 norm2 = T_(1);
1336 const auto tmp_normalized = (T_(1)/norm2) * y;
1337 return dot(x, tmp_normalized) * tmp_normalized;
1338 }
1339
1340
1341 /* ************************************************************************************************************* */
1342 /* ************************************************************************************************************* */
1343 // Tiny Matrix implementation
1344 /* ************************************************************************************************************* */
1345 /* ************************************************************************************************************* */
1346
1347 template<typename T_, int m_, int n_, int sm_, int sn_>
1348 class Matrix
1349 {
1350 static_assert(m_ > 0, "invalid row count");
1351 static_assert(n_ > 0, "invalid column count");
1352 static_assert(sm_ >= m_, "invalid row stride");
1353 static_assert(sn_ >= n_, "invalid column stride");
1354
1355 public:
1357 static constexpr int m = m_;
1359 static constexpr int n = n_;
1361 static constexpr int sm = sm_;
1363 static constexpr int sn = sn_;
1364
1366 typedef T_ ValueType;
1368 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
1369
1373 RowType v[sm_];
1374
1376 CUDA_HOST_DEVICE Matrix()
1377 {
1378 }
1379
1381 CUDA_HOST_DEVICE explicit Matrix(DataType value)
1382 {
1383 for(int i(0); i < m_; ++i)
1384 {
1385 v[i] = value;
1386 }
1387 }
1388
1390 template<typename T2_, int sma_, int sna_>
1391 CUDA_HOST_DEVICE Matrix(const Matrix<T2_, m_, n_, sma_, sna_>& a)
1392 {
1393 for(int i(0); i < m_; ++i)
1394 {
1395 for(int j(0); j < n_; ++j)
1396 {
1397 v[i][j] = T_(a.v[i][j]);
1398 }
1399 }
1400 }
1401
1413 template<typename Tx_>
1414 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<Tx_>& x)
1415 {
1416 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1417 auto it(x.begin());
1418 for(int i(0); i < m_; ++i, ++it)
1419 v[i] = *it;
1420 }
1421
1435 template<typename Tx_>
1436 CUDA_HOST_DEVICE explicit Matrix(const std::initializer_list<std::initializer_list<Tx_>>& x)
1437 {
1438 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1439 auto it(x.begin());
1440 for(int i(0); i < m_; ++i, ++it)
1441 v[i] = *it;
1442 }
1443
1445 CUDA_HOST_DEVICE Matrix& operator=(DataType value)
1446 {
1447 for(int i(0); i < m_; ++i)
1448 {
1449 v[i] = value;
1450 }
1451 return *this;
1452 }
1453
1455 template<int sma_, int sna_>
1457 {
1458 for(int i(0); i < m_; ++i)
1459 {
1460 v[i] = a.v[i];
1461 }
1462 return *this;
1463 }
1464
1477 template<typename Tx_>
1478 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<Tx_>& x)
1479 {
1480 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1481 auto it(x.begin());
1482 for(int i(0); i < m_; ++i, ++it)
1483 v[i] = *it;
1484 return *this;
1485 }
1486
1498 template<typename Tx_>
1499 CUDA_HOST_DEVICE Matrix& operator=(const std::initializer_list<std::initializer_list<Tx_>>& x)
1500 {
1501 XASSERTM(std::size_t(m_) == x.size(), "invalid initializer list size");
1502 auto it(x.begin());
1503 for(int i(0); i < m_; ++i, ++it)
1504 v[i] = *it;
1505 return *this;
1506 }
1507
1509 template<typename Tx_, int sma_, int sna_>
1510 CUDA_HOST_DEVICE void convert(const Matrix<Tx_, m_, n_, sma_, sna_>& a)
1511 {
1512 for(int i(0); i < m_; ++i)
1513 v[i].convert(a.v[i]);
1514 }
1515
1525 CUDA_HOST_DEVICE T_& operator()(int i, int j)
1526 {
1527 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1528 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1529 return v[i][j];
1530 }
1531
1533 CUDA_HOST_DEVICE const T_& operator()(int i, int j) const
1534 {
1535 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
1536 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
1537 return v[i][j];
1538 }
1539
1549 CUDA_HOST_DEVICE RowType& operator[](int i)
1550 {
1551 ASSERTM( (i >= 0) && (i <m_), "index i out-of-bounds");
1552 return v[i];
1553 }
1554
1556 CUDA_HOST_DEVICE const RowType& operator[](int i) const
1557 {
1558 ASSERTM( (i >= 0) && (i <m_), "index i out-of-bounds");
1559 return v[i];
1560 }
1561
1563 CUDA_HOST_DEVICE Matrix& operator*=(DataType alpha)
1564 {
1565 for(int i(0); i < m_; ++i)
1566 {
1567 v[i] *= alpha;
1568 }
1569 return *this;
1570 }
1571
1573 template<int sma_, int sna_>
1575 {
1576 for(int i(0); i < m_; ++i)
1577 {
1578 v[i] += a.v[i];
1579 }
1580 return *this;
1581 }
1582
1584 template<int sma_, int sna_>
1586 {
1587 for(int i(0); i < m_; ++i)
1588 {
1589 v[i] -= a.v[i];
1590 }
1591 return *this;
1592 }
1593
1600 template<int sma_, int sna_>
1601 CUDA_HOST_DEVICE void copy(const Matrix<T_, m_, n_, sma_, sna_>& a)
1602 {
1603 for(int i(0); i < m_; ++i)
1604 for(int j(0); j < n_; ++j)
1605 v[i][j] = a.v[i][j];
1606 }
1607
1620 template<int mm_, int nn_, int ma_, int na_, int sma_, int sna_>
1621 CUDA_HOST_DEVICE void copy_n(const Matrix<T_, ma_, na_, sma_, sna_>& a)
1622 {
1623 static_assert(mm_ <= m_, "invalid copy_n size");
1624 static_assert(mm_ <= ma_, "invalid copy_n size");
1625 static_assert(nn_ <= n_, "invalid copy_n size");
1626 static_assert(nn_ <= na_, "invalid copy_n size");
1627 for(int i(0); i < mm_; ++i)
1628 for(int j(0); j < nn_; ++j)
1629 v[i][j] = a.v[i][j];
1630 }
1631
1638 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
1639 {
1640 for(int i(0); i < m_; ++i)
1641 {
1642 v[i].format(alpha);
1643 }
1644 }
1645
1657 CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
1658 {
1659 DataType r(0);
1660 for(int i(0); i < m_; ++i)
1661 {
1662 #ifndef __CUDACC__
1663 r += Math::sqr(v[i][i]);
1664 #else
1665 r += CudaMath::cuda_sqr(v[i][i]);
1666 #endif
1667 for(int j(0); j < n_; ++j)
1668 {
1669 #ifndef __CUDACC__
1670 r += Math::sqr(v[i][j]);
1671 #else
1672 r += CudaMath::cuda_sqr(v[i][j]);
1673 #endif
1674 }
1675 }
1676 return r / DataType(2);
1677 }
1678
1687 CUDA_HOST_DEVICE DataType norm_frobenius() const
1688 {
1689 #ifndef __CUDACC__
1690 return Math::sqrt(norm_frobenius_sqr());
1691 #else
1692 return CudaMath::cuda_sqrt(norm_frobenius_sqr());
1693 #endif
1694 }
1695
1704 CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
1705 {
1706 DataType r(0);
1707 for(int i(0); i < m_; ++i)
1708 {
1709 for(int j(0); j < n_; ++j)
1710 {
1711 #ifndef __CUDACC__
1712 r += Math::sqr(v[i][j]);
1713 #else
1714 r += CudaMath::cuda_sqr(v[i][j]);
1715 #endif
1716 }
1717 }
1718 return r;
1719 }
1720
1726 CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
1727 {
1728 DataType r(0);
1729 for(int i(0); i < m_; ++i)
1730 {
1731 for(int j(0); j < n_; ++j)
1732 {
1733 #ifndef __CUDACC__
1734 r += Math::sqr(v[i][j] - DataType(i == j ? 1 : 0));
1735 #else
1736 r += CudaMath::cuda_sqr(v[i][j] - DataType(i == j ? 1 : 0));
1737 #endif
1738 }
1739 }
1740 #ifndef __CUDACC__
1741 return Math::sqrt(r);
1742 #else
1743 return CudaMath::cuda_sqrt(r);
1744 #endif
1745 }
1746
1756 CUDA_HOST_DEVICE DataType trace() const
1757 {
1758 #ifndef __CUDACC__
1759 int k = Math::min(m_, n_);
1760 #else
1761 int k = CudaMath::cuda_min(m_, n_);
1762 #endif
1763 DataType r(0);
1764 for(int i(0); i < k; ++i)
1765 {
1766 r += v[i][i];
1767 }
1768 return r;
1769 }
1770
1778 CUDA_HOST_DEVICE DataType det() const
1779 {
1780 return Intern::DetHelper<m_, n_>::compute(*this);
1781 }
1782
1794 CUDA_HOST_DEVICE DataType vol() const
1795 {
1796 return Intern::VolHelper<m_, n_>::compute(*this);
1797 }
1798
1809 template<int sma_, int sna_>
1811 {
1812 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1813 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1814 Intern::InverseHelper<m_, n_>::compute(*this, a);
1815 return *this;
1816 }
1817
1818 #ifdef __CUDACC__
 1834 template<typename ThreadGroup_, int sma_, int sna_>
      // Cooperative inversion: delegates to Intern::CudaGroupedInverseHelper with a
      // thread group \p tg and a caller-supplied \p det.
      // NOTE(review): helper is declared elsewhere; presumably det is the precomputed
      // determinant of 'a' used for scaling — confirm against the helper's definition.
 1835 CUDA_HOST_DEVICE __forceinline__ Matrix& grouped_set_inverse(const ThreadGroup_& tg, const Matrix<T_, m_, n_, sma_, sna_>& a, const T_& det)
 1836 {
 1837 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
 1838 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
 1839 Intern::CudaGroupedInverseHelper<m_, n_>::compute(tg, *this, a, det);
 1840 return *this;
 1841 }
1842 #endif
1843
1861 template<int sma_, int sna_>
1863 {
1864 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1865 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1866 Intern::CofactorHelper<m_, n_>::compute(*this, a);
1867 return *this;
1868 }
1869
1878 template<int sma_, int sna_>
1880 {
1881 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
1882 ASSERTM((const void*)this != (const void*)&a, "result matrix and input matrix 'a' must be different objects");
1883
1884 for(int i(0); i < m_; ++i)
1885 {
1886 for(int j(0); j < n_; ++j)
1887 {
1888 v[i][j] = a.v[j][i];
1889 }
1890 }
1891 return *this;
1892 }
1893
1906 template<int l_, int sla_, int sna_>
1908 {
1909 static_assert(m_ == n_, "Gram matrices must be square");
1910
1911 format();
1912
1913 for(int k(0); k < l_; ++k)
1914 {
1915 for(int i(0); i < n_; ++i)
1916 {
1917 for(int j(0); j < n_; ++j)
1918 {
1919 v[i][j] += a.v[k][i] * a.v[k][j];
1920 }
1921 }
1922 }
1923
1924 return *this;
1925 }
1926
1942 template<int snx_, int sny_>
1943 CUDA_HOST_DEVICE DataType scalar_product(const Vector<T_, m_, snx_>& x, const Vector<T_, n_, sny_>& y) const
1944 {
1945 DataType r(DataType(0));
1946 for(int i(0); i < m_; ++i)
1947 {
1948 r += x[i] * dot(v[i], y);
1949 }
1950 return r;
1951 }
1952
1970 template<int snx_, int sny_>
1971 CUDA_HOST_DEVICE Matrix& add_outer_product(
1972 const Vector<T_, m_, snx_>& x,
1973 const Vector<T_, n_, sny_>& y,
1974 const DataType alpha = DataType(1))
1975 {
1976 for(int i(0); i < m_; ++i)
1977 {
1978 for(int j(0); j < n_; ++j)
1979 {
1980 v[i][j] += alpha * x[i] * y[j];
1981 }
1982 }
1983 return *this;
1984 }
1985
2000 template<int snx_, int sny_>
2001 CUDA_HOST_DEVICE Matrix& set_outer_product(
2002 const Vector<T_, m_, snx_>& x,
2003 const Vector<T_, n_, sny_>& y)
2004 {
2005 for(int i(0); i < m_; ++i)
2006 {
2007 for(int j(0); j < n_; ++j)
2008 {
2009 v[i][j] = x[i] * y[j];
2010 }
2011 }
2012 return *this;
2013 }
2014
2026 template<int sma_, int sna_>
2027 CUDA_HOST_DEVICE Matrix& axpy(DataType alpha, const Matrix<T_, m_, n_, sma_, sna_>& a)
2028 {
2029 for(int i(0); i < m_; ++i)
2030 {
2031 for(int j(0); j < n_; ++j)
2032 {
2033 v[i][j] += alpha * a.v[i][j];
2034 }
2035 }
2036 return *this;
2037 }
2038
2045 CUDA_HOST_DEVICE Matrix& add_scalar_main_diag(DataType alpha)
2046 {
2047 for(int i(0); (i < m_) && (i < n_); ++i)
2048 v[i][i] += alpha;
2049 return *this;
2050 }
2051
2070 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2071 CUDA_HOST_DEVICE Matrix& add_mat_mat_mult(
2074 DataType alpha = DataType(1))
2075 {
2076 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2077 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2078 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2079 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2080
2081 for(int i(0); i < m_; ++i)
2082 {
2083 for(int j(0); j < n_; ++j)
2084 {
2085 DataType r(0);
2086 for(int k(0); k < la_; ++k)
2087 {
2088 r += a.v[i][k] * b.v[k][j];
2089 }
2090 v[i][j] += alpha * r;
2091 }
2092 }
2093 return *this;
2094 }
2095
2111 template<int la_, int lb_, int sma_, int sna_, int smb_, int snb_>
2113 {
2114 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2115 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2116 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2117 ASSERTM(la_ == lb_, "second dimension of a must be equal to first dimension of b");
2118
2119 format();
2120 return add_mat_mat_mult(a, b);
2121 }
2122
2144 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2145 CUDA_HOST_DEVICE Matrix& add_double_mat_mult(
2149 DataType alpha = DataType(1))
2150 {
2151 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2152 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2153 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2154 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2155
2156 for(int i(0); i < m_; ++i)
2157 {
2158 for(int j(0); j < n_; ++j)
2159 {
2160 DataType r(0);
2161 for(int p(0); p < k_; ++p)
2162 {
2163 DataType t(0);
2164 for(int q(0); q < l_; ++q)
2165 {
2166 t += a(p,q) * d(q,j);
2167 }
2168 r += b(p,i)*t;
2169 }
2170 v[i][j] += alpha * r;
2171 }
2172 }
2173 return *this;
2174 }
2175
2197 template<int k_, int l_, int sma_, int sna_, int smb_, int snb_, int smd_, int snd_>
2198 CUDA_HOST_DEVICE Matrix& set_double_mat_mult(
2202 T_ alpha = T_(1))
2203 {
2204 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2205 ASSERTM((const void*)this != (const void*)&a, "result matrix and multiplicand matrix 'a' must be different objects");
2206 ASSERTM((const void*)this != (const void*)&b, "result matrix and multiplicand matrix 'b' must be different objects");
2207 ASSERTM((const void*)this != (const void*)&d, "result matrix and multiplicand matrix 'd' must be different objects");
2208
2209 format();
2210 return add_double_mat_mult(a, b, d, alpha);
2211 }
2212
2233 template<int l_, int snv_, int slt_, int smt_, int snt_>
2234 CUDA_HOST_DEVICE Matrix& add_vec_tensor_mult(
2235 const Vector<T_, l_, snv_>& x,
2237 DataType alpha = DataType(1))
2238 {
2239 for(int i(0); i < m_; ++i)
2240 {
2241 for(int j(0); j < n_; ++j)
2242 {
2243 DataType r(0);
2244 for(int k(0); k < l_; ++k)
2245 {
2246 r += x(k) * t(k,i,j);
2247 }
2248 v[i][j] += alpha * r;
2249 }
2250 }
2251 return *this;
2252 }
2253
2274 template<int l_, int snv_, int slt_, int smt_, int snt_>
2275 CUDA_HOST_DEVICE Matrix& set_vec_tensor_mult(
2276 const Vector<T_, l_, snv_>& x,
2278 DataType alpha = DataType(1))
2279 {
2280 format();
2281 return add_vec_tensor_mult(x, t, alpha);
2282 }
2283
2289 CUDA_HOST_DEVICE Matrix& set_identity()
2290 {
2291 for(int i(0); i < m_; ++i)
2292 for(int j(0); j < n_; ++j)
2293 v[i][j] = (i == j ? T_(1) : T_(0));
2294 return *this;
2295 }
2296
2305 CUDA_HOST_DEVICE Matrix& set_rotation_2d(T_ angle)
2306 {
2307 static_assert((m_ == 2) && (n_ == 2), "this function works only for 2x2 matrices");
2308 #ifndef __CUDACC__
2309 v[0][0] = (v[1][1] = Math::cos(angle));
2310 v[0][1] = -(v[1][0] = Math::sin(angle));
2311 #else
2312 v[0][0] = (v[1][1] = CudaMath::cuda_cos(angle));
2313 v[0][1] = -(v[1][0] = CudaMath::cuda_sin(angle));
2314 #endif
2315 return *this;
2316 }
2317
2332 CUDA_HOST_DEVICE Matrix& set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
2333 {
2334 static_assert((m_ == 3) && (n_ == 3), "this function works only for 3x3 matrices");
2335 #ifndef __CUDACC__
2336 const T_ cy = Math::cos(yaw);
2337 const T_ sy = Math::sin(yaw);
2338 const T_ cp = Math::cos(pitch);
2339 const T_ sp = Math::sin(pitch);
2340 const T_ cr = Math::cos(roll);
2341 const T_ sr = Math::sin(roll);
2342 #else
2343 const T_ cy = CudaMath::cuda_cos(yaw);
2344 const T_ sy = CudaMath::cuda_sin(yaw);
2345 const T_ cp = CudaMath::cuda_cos(pitch);
2346 const T_ sp = CudaMath::cuda_sin(pitch);
2347 const T_ cr = CudaMath::cuda_cos(roll);
2348 const T_ sr = CudaMath::cuda_sin(roll);
2349 #endif
2350 v[0][0] = cy*cp;
2351 v[0][1] = cy*sp*sr - sy*cr;
2352 v[0][2] = cy*sp*cr + sy*sr;
2353 v[1][0] = sy*cp;
2354 v[1][1] = sy*sp*sr + cy*cr;
2355 v[1][2] = sy*sp*cr - cy*sr;
2356 v[2][0] = -sp;
2357 v[2][1] = cp*sr;
2358 v[2][2] = cp*cr;
2359 return *this;
2360 }
2361
2365 CUDA_HOST_DEVICE static Matrix null()
2366 {
2367 return Matrix(DataType(0));
2368 }
2369
2376 CUDA_HOST friend std::ostream & operator<< (std::ostream & lhs, const Matrix& A)
2377 {
2378 for (int i(0) ; i < m-1 ; ++i)
2379 {
2380 lhs << A[i] << "\n";
2381 }
2382 lhs << A[m-1];
2383
2384 return lhs;
2385 }
2386
2393 CUDA_HOST friend std::istream& operator>>(std::istream & in, Matrix& A)
2394 {
2395 for (int i(0) ; i < m; ++i)
2396 {
2397 in >> A[i];
2398 }
2399
2400 return in;
2401 }
2402 }; // class Matrix
2403
2413 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2415 {
2416 static_assert(mx_ >= 2, "invalid nu vector size for orthogonal_2x1");
2417 static_assert(ma_ >= 2, "invalid matrix row size for orthogonal_2x1");
2418 static_assert(na_ >= 1, "invalid matrix column size for orthogonal_2x1");
2419
2420 // 2d "cross" product. The sign has to be on the second component so the input is rotated in negative direction
2421 nu[0] = tau[1][0];
2422 nu[1] = -tau[0][0];
2423 }
2424
2434 template<typename T_, int mx_, int smx_, int ma_, int na_, int sm_, int sn_>
2436 {
2437 static_assert(mx_ >= 3, "invalid nu vector size for orthogonal_3x2");
2438 static_assert(ma_ >= 3, "invalid matrix row size for orthogonal_3x2");
2439 static_assert(na_ >= 2, "invalid matrix column size for orthogonal_3x2");
2440
2441 // 3d cross product
2442 nu[0] = tau[1][0]*tau[2][1] - tau[2][0]*tau[1][1];
2443 nu[1] = tau[2][0]*tau[0][1] - tau[0][0]*tau[2][1];
2444 nu[2] = tau[0][0]*tau[1][1] - tau[1][0]*tau[0][1];
2445 }
2446
2447#ifdef DOXYGEN
2471 template<typename T_, int m_, int sm_, int sn_>
2473#endif
2474
2476 template<typename T_, int sm_, int sn_>
2477 CUDA_HOST_DEVICE Vector<T_, 2> orthogonal(const Matrix<T_, 2, 1, sm_, sn_>& tau)
2478 {
2479 Vector<T_, 2, sm_> nu(T_(0));
2480 orthogonal_2x1(nu, tau);
2481 return nu;
2482 }
2483
2484 template<typename T_, int sm_, int sn_>
2485 CUDA_HOST_DEVICE Vector<T_, 3> orthogonal(const Matrix<T_, 3, 2, sm_, sn_>& tau)
2486 {
2487 Vector<T_, 3, sm_> nu(T_(0));
2488 orthogonal_3x2(nu, tau);
2489 return nu;
2490 }
2492
2494 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2496 {
2497 return Vector<T_, m_>().set_mat_vec_mult(a, x);
2498 }
2499
2501 template<typename T_, int m_, int n_, int sm_, int sn_, int sx_>
2503 {
2504 return Vector<T_, n_>().set_vec_mat_mult(x, a);
2505 }
2506
2508 template<typename T_, int m_, int n_, int sm_, int sn_>
2510 {
2511 return Matrix<T_, m_, n_>(a) *= alpha;
2512 }
2513
2515 template<typename T_, int m_, int n_, int sm_, int sn_>
2517 {
2518 return Matrix<T_, m_, n_>(a) *= alpha;
2519 }
2520
2522 template<typename T_, int m_, int n_, int l_, int sma_, int sna_, int smb_, int snb_>
2524 {
2525 return Matrix<T_, m_, n_>().set_mat_mat_mult(a, b);
2526 }
2527
2529 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2531 {
2532 return Matrix<T_, m_, n_>(a) += b;
2533 }
2534
2536 template<typename T_, int m_, int n_,int sma_, int sna_, int smb_, int snb_>
2538 {
2539 return Matrix<T_, m_, n_>(a) -= b;
2540 }
2541
2542 /* ************************************************************************************************************* */
2543 /* ************************************************************************************************************* */
2544 // Tiny Tensor3 implementation
2545 /* ************************************************************************************************************* */
2546 /* ************************************************************************************************************* */
2547
2548 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2549 class Tensor3
2550 {
2551 static_assert(l_ > 0, "invalid tube count");
2552 static_assert(m_ > 0, "invalid row count");
2553 static_assert(n_ > 0, "invalid column count");
2554 static_assert(sl_ >= l_, "invalid tube stride");
2555 static_assert(sm_ >= m_, "invalid row stride");
2556 static_assert(sn_ >= n_, "invalid column stride");
2557
2558 public:
2560 static constexpr int l = l_;
2562 static constexpr int m = m_;
2564 static constexpr int n = n_;
2566 static constexpr int sl = sl_;
2568 static constexpr int sm = sm_;
2570 static constexpr int sn = sn_;
2571
2573 typedef T_ ValueType;
2575 typedef typename Intern::DataTypeExtractor<ValueType>::MyDataType DataType;
2576
2581
2583 CUDA_HOST_DEVICE Tensor3()
2584 {
2585 }
2586
2588 CUDA_HOST_DEVICE explicit Tensor3(DataType value)
2589 {
2590 for(int i(0); i < l_; ++i)
2591 v[i] = value;
2592 }
2593
2600 template<typename Tx_>
2601 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<Tx_>& x)
2602 {
2603 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2604 auto it(x.begin());
2605 for(int i(0); i < l_; ++i, ++it)
2606 v[i] = *it;
2607 }
2608
2615 template<typename Tx_>
2616 CUDA_HOST_DEVICE explicit Tensor3(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2617 {
2618 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2619 auto it(x.begin());
2620 for(int i(0); i < l_; ++i, ++it)
2621 v[i] = *it;
2622 }
2623
2625 template<int sla_, int sma_, int sna_>
2627 {
2628 for(int i(0); i < l_; ++i)
2629 v[i] = a.v[i];
2630 }
2631
2633 CUDA_HOST_DEVICE Tensor3& operator=(DataType value)
2634 {
2635 for(int i(0); i < l_; ++i)
2636 v[i] = value;
2637 return *this;
2638 }
2639
2641 template<int sla_, int sma_, int sna_>
2643 {
2644 for(int i(0); i < l_; ++i)
2645 v[i] = a.v[i];
2646 return *this;
2647 }
2648
2657 template<typename Tx_>
2658 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<Tx_>& x)
2659 {
2660 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2661 auto it(x.begin());
2662 for(int i(0); i < l_; ++i, ++it)
2663 v[i] = *it;
2664 return *this;
2665 }
2666
2675 template<typename Tx_>
2676 CUDA_HOST_DEVICE Tensor3& operator=(const std::initializer_list<std::initializer_list<std::initializer_list<Tx_>>>& x)
2677 {
2678 XASSERTM(std::size_t(l_) == x.size(), "invalid initializer list size");
2679 auto it(x.begin());
2680 for(int i(0); i < l_; ++i, ++it)
2681 v[i] = *it;
2682 return *this;
2683 }
2684
2686 template<typename Tx_, int sla_, int sma_, int sna_>
2688 {
2689 for(int i(0); i < l_; ++i)
2690 v[i].convert(a.v[i]);
2691 }
2692
2702 CUDA_HOST_DEVICE T_& operator()(int h, int i, int j)
2703 {
2704 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2705 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2706 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2707 return v[h](i,j);
2708 }
2709
2711 CUDA_HOST_DEVICE const T_& operator()(int h, int i, int j) const
2712 {
2713 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2714 ASSERTM( (i >= 0) && (i < m_), "index i out-of-bounds");
2715 ASSERTM( (j >= 0) && (j < n_), "index j out-of-bounds");
2716 return v[h](i,j);
2717 }
2718
2728 CUDA_HOST_DEVICE PlaneType& operator[](int h)
2729 {
2730 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2731 return v[h];
2732 }
2733
2735 CUDA_HOST_DEVICE const PlaneType& operator[](int h) const
2736 {
2737 ASSERTM( (h >= 0) && (h < l_), "index h out-of-bounds");
2738 return v[h];
2739 }
2740
2742 CUDA_HOST_DEVICE Tensor3& operator*=(DataType alpha)
2743 {
2744 for(int i(0); i < l_; ++i)
2745 v[i] *= alpha;
2746 return *this;
2747 }
2748
2750 template<int sla_, int sma_, int sna_>
2752 {
2753 for(int i(0); i < l_; ++i)
2754 v[i] += a.v[i];
2755 return *this;
2756 }
2757
2759 template<int sla_, int sma_, int sna_>
2761 {
2762 for(int i(0); i < l_; ++i)
2763 v[i] -= a.v[i];
2764 return *this;
2765 }
2766
2773 template<int sla_, int sma_, int sna_>
2774 CUDA_HOST_DEVICE void copy(const Tensor3<T_, l_, m_, n_, sla_, sma_, sna_>& a)
2775 {
2776 for(int i(0); i < l_; ++i)
2777 for(int j(0); j < m_; ++j)
2778 for(int k(0); k < n_; ++k)
2779 v[i][j][k] = a.v[i][j][k];
2780 }
2781
2794 template<int ll_,int mm_, int nn_, int la_, int ma_, int na_, int sla_, int sma_, int sna_>
2796 {
2797 static_assert(ll_ <= l_, "invalid copy_n size");
2798 static_assert(ll_ <= la_, "invalid copy_n size");
2799 static_assert(mm_ <= m_, "invalid copy_n size");
2800 static_assert(mm_ <= ma_, "invalid copy_n size");
2801 static_assert(nn_ <= n_, "invalid copy_n size");
2802 static_assert(nn_ <= na_, "invalid copy_n size");
2803 for(int i(0); i < ll_; ++i)
2804 for(int j(0); j < mm_; ++j)
2805 for(int k(0); k < nn_; ++k)
2806 v[i][j][k] = a.v[i][j][k];
2807 }
2808
2810 CUDA_HOST_DEVICE void format(DataType alpha = DataType(0))
2811 {
2812 (*this) = alpha;
2813 }
2814
2835 template<int k_, int sma_, int sna_, int slt_, int smt_, int snt_>
2836 CUDA_HOST_DEVICE Tensor3& add_mat_tensor_mult(
2839 DataType alpha = DataType(1))
2840 {
2841 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2842 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2843
2844 for(int h(0); h < l_; ++h)
2845 {
2846 for(int i(0); i < m_; ++i)
2847 {
2848 for(int j(0); j < n_; ++j)
2849 {
2850 DataType r(0);
2851 for(int p(0); p < k_; ++p)
2852 {
2853 r += a(h,p) * t(p,i,j);
2854 }
2855 operator()(h,i,j) += alpha * r;
2856 }
2857 }
2858 }
2859 return *this;
2860 }
2861
2886 template<
2887 int lt_, int mt_, int nt_, // input tensor dimensions
2888 int slt_, int smt_, int snt_, // input tensor strides
2889 int smb_, int snb_, int smd_, int snd_> // input matrix strides
2890 CUDA_HOST_DEVICE Tensor3& add_double_mat_mult(
2894 DataType alpha = DataType(1))
2895 {
2896 // we have to compare void* addresses here, because we might get a type mismatch error otherwise
2897 ASSERTM((const void*)this != (const void*)&t, "result tensor and multiplicand tensor 't' must be different objects");
2898
2899 for(int h(0); h < l_; ++h)
2900 {
2901 for(int i(0); i < m_; ++i)
2902 {
2903 for(int j(0); j < n_; ++j)
2904 {
2905 DataType r(0);
2906 for(int p(0); p < mt_; ++p)
2907 {
2908 for(int q(0); q < nt_; ++q)
2909 {
2910 r += t(h,p,q) * b(p,i) * d(q,j);
2911 }
2912 }
2913 operator()(h,i,j) += alpha * r;
2914 }
2915 }
2916 }
2917 return *this;
2918 }
2919
2940 template<int slx_, int sma_, int sna_>
2942 const Vector<T_, l_, slx_>& x,
2944 DataType alpha = DataType(1))
2945 {
2946 for(int h(0); h < l_; ++h)
2947 {
2948 for(int i(0); i < m_; ++i)
2949 {
2950 for(int j(0); j < n_; ++j)
2951 {
2952 operator()(h,i,j) += alpha * x(h) * a(i, j);
2953 }
2954 }
2955 }
2956 return *this;
2957 }
2958
2962 CUDA_HOST_DEVICE static Tensor3 null()
2963 {
2964 return Tensor3(DataType(0));
2965 }
2966 }; // class Tensor3<...>
2967
2969 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2972 {
2973 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2974 }
2975
2977 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
2980 {
2981 return Tensor3<T_, l_, m_, n_, sl_, sm_, sn_>(a) *= alpha;
2982 }
2983
2984 /* ************************************************************************************************************* */
2985 /* ************************************************************************************************************* */
2986 // Various helper functions
2987 /* ************************************************************************************************************* */
2988 /* ************************************************************************************************************* */
3000 template<typename T_, typename std::enable_if<Intern::DataTypeExtractor<T_>::level == 0, bool>::type = true>
3001 CUDA_HOST_DEVICE inline T_ dot(const T_& a, const T_& b)
3002 {
3003 return a*b;
3004 }
3005
3017 template<typename T_, int n_, int sa_, int sb_>
3018 CUDA_HOST_DEVICE inline typename Vector<T_, n_>::DataType dot(const Vector<T_, n_, sa_>& a, const Vector<T_, n_, sb_>& b)
3019 {
3020 typename Vector<T_, n_>::DataType r(0);
3021
3022 for(int i(0); i < n_; ++i)
3023 {
3024 r += Tiny::dot(a.v[i], b.v[i]);
3025 }
3026 return r;
3027 }
3028
3040 template<typename T_, int m_, int n_, int sma_, int sna_, int smb_, int snb_>
3042 {
3043 typename Matrix<T_, m_, n_>::DataType r(0);
3044 for(int i(0); i < m_; ++i)
3045 {
3046 r += Tiny::dot(a.v[i], b.v[i]);
3047 }
3048 return r;
3049 }
3050
3059 template<typename T_, int l_, int m_, int n_, int sla_ ,int sma_, int sna_, int slb_, int smb_, int snb_>
3060 CUDA_HOST_DEVICE inline typename Tensor3<T_, l_, m_, n_>::DataType dot(
3063 {
3065 for(int i(0); i < l_; ++i)
3066 {
3067 r += Tiny::dot(a.v[i], b.v[i]);
3068 }
3069 return r;
3070 }
3071
3081 template<typename T_>
3082 CUDA_HOST_DEVICE inline void add_id(T_& x, const T_& alpha)
3083 {
3084 x += alpha;
3085 }
3086
3096 template<typename T_, int n_, int sn_>
3097 CUDA_HOST_DEVICE inline void add_id(Vector<T_, n_, sn_>& x, const typename Vector<T_, n_, sn_>::DataType& alpha)
3098 {
3099 for(int i(0); i < n_; ++i)
3100 add_id(x(i), alpha);
3101 }
3102
3112 template<typename T_, int n_, int sm_, int sn_>
3113 CUDA_HOST_DEVICE inline void add_id(Matrix<T_, n_, n_, sm_, sn_>& x, const typename Matrix<T_, n_, n_, sm_, sn_>::DataType& alpha)
3114 {
3115 for(int i(0); i < n_; ++i)
3116 add_id(x(i,i), alpha);
3117 }
3118
3128 template<typename T_, int n_, int sl_, int sm_, int sn_>
3130 {
3131 for(int i(0); i < n_; ++i)
3132 add_id(x(i,i,i), alpha);
3133 }
3134
3149 template<typename T_>
3150 CUDA_HOST_DEVICE inline void axpy(T_& y, const T_& x, const T_& alpha)
3151 {
3152 y += alpha*x;
3153 }
3154
3169 template<typename T_, int n_, int sn_>
3170 CUDA_HOST_DEVICE inline void axpy(
3172 const Vector<T_, n_, sn_>& x,
3173 const typename Vector<T_, n_, sn_>::DataType& alpha)
3174 {
3175 for(int i(0); i < n_; ++i)
3176 axpy(y.v[i], x.v[i], alpha);
3177 }
3178
3193 template<typename T_, int m_, int n_, int sm_, int sn_>
3194 CUDA_HOST_DEVICE inline void axpy(
3197 const typename Matrix<T_, m_, n_, sm_, sn_>::DataType& alpha)
3198 {
3199 for(int i(0); i < m_; ++i)
3200 axpy(y.v[i], x.v[i], alpha);
3201 }
3202
3217 template<typename T_, int l_, int m_, int n_, int sl_, int sm_, int sn_>
3218 CUDA_HOST_DEVICE inline void axpy(
3222 {
3223 for(int i(0); i < l_; ++i)
3224 axpy(y.v[i], x.v[i], alpha);
3225 }
3226
3227 /* ************************************************************************************************************* */
3228 /* ************************************************************************************************************* */
3229 // Internal helpers implementation
3230 /* ************************************************************************************************************* */
3231 /* ************************************************************************************************************* */
3232
3234 namespace Intern
3235 {
3236 // generic square matrix inversion:
3237 template<int n_>
3238 struct DetHelper<n_,n_>
3239 {
3240 template<typename T_, int sma_, int sna_>
3241 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3242 {
3243 // perform matrix inversion which returns the determinant
3245 const T_ det = Intern::InverseHelper<n_, n_>::compute(b, a);
3246
3247 // if the returned value is not normal, we can assume that the matrix is singular
3248 #ifndef __CUDACC__
3249 return Math::isnormal(det) ? det : T_(0);
3250 #else
3251 return CudaMath::cuda_isnormal(det) ? det : T_(0);
3252 #endif
3253 }
3254 };
3255
3256 template<>
3257 struct DetHelper<1,1>
3258 {
3259 template<typename T_, int sma_, int sna_>
3260 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3261 {
3262 return a(0,0);
3263 }
3264 };
3265
3266
3267 template<>
3268 struct DetHelper<2,2>
3269 {
3270 template<typename T_, int sma_, int sna_>
3271 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3272 {
3273 return a(0,0)*a(1,1) - a(0,1)*a(1,0);
3274 }
3275 };
3276
3277 template<>
3278 struct DetHelper<3,3>
3279 {
3280 template<typename T_, int sma_, int sna_>
3281 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3282 {
3283 return a(0,0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1))
3284 + a(0,1)*(a(1,2)*a(2,0) - a(1,0)*a(2,2))
3285 + a(0,2)*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
3286 }
3287 };
3288
3289 template<>
3290 struct DetHelper<4,4>
3291 {
3292 template<typename T_, int sma_, int sna_>
3293 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
3294 {
3295 // 2x2 determinants of rows 3-4
3296 T_ w[6] =
3297 {
3298 a(2,0)*a(3,1) - a(2,1)*a(3,0),
3299 a(2,0)*a(3,2) - a(2,2)*a(3,0),
3300 a(2,0)*a(3,3) - a(2,3)*a(3,0),
3301 a(2,1)*a(3,2) - a(2,2)*a(3,1),
3302 a(2,1)*a(3,3) - a(2,3)*a(3,1),
3303 a(2,2)*a(3,3) - a(2,3)*a(3,2)
3304 };
3305
3306 return
3307 + a(0,0) * (a(1,1)*w[5] - a(1,2)*w[4] + a(1,3)*w[3])
3308 - a(0,1) * (a(1,0)*w[5] - a(1,2)*w[2] + a(1,3)*w[1])
3309 + a(0,2) * (a(1,0)*w[4] - a(1,1)*w[2] + a(1,3)*w[0])
3310 - a(0,3) * (a(1,0)*w[3] - a(1,1)*w[1] + a(1,2)*w[0]);
3311 }
3312 };
3313
3314 template<>
3315 struct DetHelper<5,5>
3316 {
3317 template<typename T_, int sma_, int sna_>
3318 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
3319 {
3320 // 2x2 determinants of rows 4-5
3321 T_ v[10] =
3322 {
3323 a(3,0)*a(4,1) - a(3,1)*a(4,0),
3324 a(3,0)*a(4,2) - a(3,2)*a(4,0),
3325 a(3,0)*a(4,3) - a(3,3)*a(4,0),
3326 a(3,0)*a(4,4) - a(3,4)*a(4,0),
3327 a(3,1)*a(4,2) - a(3,2)*a(4,1),
3328 a(3,1)*a(4,3) - a(3,3)*a(4,1),
3329 a(3,1)*a(4,4) - a(3,4)*a(4,1),
3330 a(3,2)*a(4,3) - a(3,3)*a(4,2),
3331 a(3,2)*a(4,4) - a(3,4)*a(4,2),
3332 a(3,3)*a(4,4) - a(3,4)*a(4,3)
3333 };
3334 // 3x3 determinants of rows 3-4-5
3335 T_ w[10] =
3336 {
3337 a(2,0)*v[4] - a(2,1)*v[1] + a(2,2)*v[0],
3338 a(2,0)*v[5] - a(2,1)*v[2] + a(2,3)*v[0],
3339 a(2,0)*v[6] - a(2,1)*v[3] + a(2,4)*v[0],
3340 a(2,0)*v[7] - a(2,2)*v[2] + a(2,3)*v[1],
3341 a(2,0)*v[8] - a(2,2)*v[3] + a(2,4)*v[1],
3342 a(2,0)*v[9] - a(2,3)*v[3] + a(2,4)*v[2],
3343 a(2,1)*v[7] - a(2,2)*v[5] + a(2,3)*v[4],
3344 a(2,1)*v[8] - a(2,2)*v[6] + a(2,4)*v[4],
3345 a(2,1)*v[9] - a(2,3)*v[6] + a(2,4)*v[5],
3346 a(2,2)*v[9] - a(2,3)*v[8] + a(2,4)*v[7]
3347 };
3348
3349 return
3350 + a(0,0)*(a(1,1)*w[9] - a(1,2)*w[8] + a(1,3)*w[7] - a(1,4)*w[6])
3351 - a(0,1)*(a(1,0)*w[9] - a(1,2)*w[5] + a(1,3)*w[4] - a(1,4)*w[3])
3352 + a(0,2)*(a(1,0)*w[8] - a(1,1)*w[5] + a(1,3)*w[2] - a(1,4)*w[1])
3353 - a(0,3)*(a(1,0)*w[7] - a(1,1)*w[4] + a(1,2)*w[2] - a(1,4)*w[0])
3354 + a(0,4)*(a(1,0)*w[6] - a(1,1)*w[3] + a(1,2)*w[1] - a(1,3)*w[0]);
3355 }
3356 };
3357
3358 template<>
3359 struct DetHelper<6,6>
3360 {
3361 template<typename T_, int sma_, int sna_>
3362 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
3363 {
3364 // 2x2 determinants of rows 5-6
3365 T_ v[15] =
3366 {
3367 a(4,0)*a(5,1) - a(4,1)*a(5,0),
3368 a(4,0)*a(5,2) - a(4,2)*a(5,0),
3369 a(4,0)*a(5,3) - a(4,3)*a(5,0),
3370 a(4,0)*a(5,4) - a(4,4)*a(5,0),
3371 a(4,0)*a(5,5) - a(4,5)*a(5,0),
3372 a(4,1)*a(5,2) - a(4,2)*a(5,1),
3373 a(4,1)*a(5,3) - a(4,3)*a(5,1),
3374 a(4,1)*a(5,4) - a(4,4)*a(5,1),
3375 a(4,1)*a(5,5) - a(4,5)*a(5,1),
3376 a(4,2)*a(5,3) - a(4,3)*a(5,2),
3377 a(4,2)*a(5,4) - a(4,4)*a(5,2),
3378 a(4,2)*a(5,5) - a(4,5)*a(5,2),
3379 a(4,3)*a(5,4) - a(4,4)*a(5,3),
3380 a(4,3)*a(5,5) - a(4,5)*a(5,3),
3381 a(4,4)*a(5,5) - a(4,5)*a(5,4)
3382 };
3383 // 3x3 determinants of rows 4-5-6
3384 T_ w[20] =
3385 {
3386 a(3,0)*v[ 5] - a(3,1)*v[ 1] + a(3,2)*v[ 0],
3387 a(3,0)*v[ 6] - a(3,1)*v[ 2] + a(3,3)*v[ 0],
3388 a(3,0)*v[ 7] - a(3,1)*v[ 3] + a(3,4)*v[ 0],
3389 a(3,0)*v[ 8] - a(3,1)*v[ 4] + a(3,5)*v[ 0],
3390 a(3,0)*v[ 9] - a(3,2)*v[ 2] + a(3,3)*v[ 1],
3391 a(3,0)*v[10] - a(3,2)*v[ 3] + a(3,4)*v[ 1],
3392 a(3,0)*v[11] - a(3,2)*v[ 4] + a(3,5)*v[ 1],
3393 a(3,0)*v[12] - a(3,3)*v[ 3] + a(3,4)*v[ 2],
3394 a(3,0)*v[13] - a(3,3)*v[ 4] + a(3,5)*v[ 2],
3395 a(3,0)*v[14] - a(3,4)*v[ 4] + a(3,5)*v[ 3],
3396 a(3,1)*v[ 9] - a(3,2)*v[ 6] + a(3,3)*v[ 5],
3397 a(3,1)*v[10] - a(3,2)*v[ 7] + a(3,4)*v[ 5],
3398 a(3,1)*v[11] - a(3,2)*v[ 8] + a(3,5)*v[ 5],
3399 a(3,1)*v[12] - a(3,3)*v[ 7] + a(3,4)*v[ 6],
3400 a(3,1)*v[13] - a(3,3)*v[ 8] + a(3,5)*v[ 6],
3401 a(3,1)*v[14] - a(3,4)*v[ 8] + a(3,5)*v[ 7],
3402 a(3,2)*v[12] - a(3,3)*v[10] + a(3,4)*v[ 9],
3403 a(3,2)*v[13] - a(3,3)*v[11] + a(3,5)*v[ 9],
3404 a(3,2)*v[14] - a(3,4)*v[11] + a(3,5)*v[10],
3405 a(3,3)*v[14] - a(3,4)*v[13] + a(3,5)*v[12]
3406 };
3407 // 4x4 determinants of rows 3-4-5-6
3408 v[ 0] = a(2,0)*w[10] - a(2,1)*w[ 4] + a(2,2)*w[ 1] - a(2,3)*w[ 0];
3409 v[ 1] = a(2,0)*w[11] - a(2,1)*w[ 5] + a(2,2)*w[ 2] - a(2,4)*w[ 0];
3410 v[ 2] = a(2,0)*w[12] - a(2,1)*w[ 6] + a(2,2)*w[ 3] - a(2,5)*w[ 0];
3411 v[ 3] = a(2,0)*w[13] - a(2,1)*w[ 7] + a(2,3)*w[ 2] - a(2,4)*w[ 1];
3412 v[ 4] = a(2,0)*w[14] - a(2,1)*w[ 8] + a(2,3)*w[ 3] - a(2,5)*w[ 1];
3413 v[ 5] = a(2,0)*w[15] - a(2,1)*w[ 9] + a(2,4)*w[ 3] - a(2,5)*w[ 2];
3414 v[ 6] = a(2,0)*w[16] - a(2,2)*w[ 7] + a(2,3)*w[ 5] - a(2,4)*w[ 4];
3415 v[ 7] = a(2,0)*w[17] - a(2,2)*w[ 8] + a(2,3)*w[ 6] - a(2,5)*w[ 4];
3416 v[ 8] = a(2,0)*w[18] - a(2,2)*w[ 9] + a(2,4)*w[ 6] - a(2,5)*w[ 5];
3417 v[ 9] = a(2,0)*w[19] - a(2,3)*w[ 9] + a(2,4)*w[ 8] - a(2,5)*w[ 7];
3418 v[10] = a(2,1)*w[16] - a(2,2)*w[13] + a(2,3)*w[11] - a(2,4)*w[10];
3419 v[11] = a(2,1)*w[17] - a(2,2)*w[14] + a(2,3)*w[12] - a(2,5)*w[10];
3420 v[12] = a(2,1)*w[18] - a(2,2)*w[15] + a(2,4)*w[12] - a(2,5)*w[11];
3421 v[13] = a(2,1)*w[19] - a(2,3)*w[15] + a(2,4)*w[14] - a(2,5)*w[13];
3422 v[14] = a(2,2)*w[19] - a(2,3)*w[18] + a(2,4)*w[17] - a(2,5)*w[16];
3423
3424 return
3425 + a(0,0)*(a(1,1)*v[14] - a(1,2)*v[13] + a(1,3)*v[12] - a(1,4)*v[11] + a(1,5)*v[10])
3426 - a(0,1)*(a(1,0)*v[14] - a(1,2)*v[ 9] + a(1,3)*v[ 8] - a(1,4)*v[ 7] + a(1,5)*v[ 6])
3427 + a(0,2)*(a(1,0)*v[13] - a(1,1)*v[ 9] + a(1,3)*v[ 5] - a(1,4)*v[ 4] + a(1,5)*v[ 3])
3428 - a(0,3)*(a(1,0)*v[12] - a(1,1)*v[ 8] + a(1,2)*v[ 5] - a(1,4)*v[ 2] + a(1,5)*v[ 1])
3429 + a(0,4)*(a(1,0)*v[11] - a(1,1)*v[ 7] + a(1,2)*v[ 4] - a(1,3)*v[ 2] + a(1,5)*v[ 0])
3430 - a(0,5)*(a(1,0)*v[10] - a(1,1)*v[ 6] + a(1,2)*v[ 3] - a(1,3)*v[ 1] + a(1,4)*v[ 0]);
3431 }
3432 };
3433
3439
3440 template<int m_, int n_>
3441 struct VolHelper
3442 {
3443 template<typename T_, int sma_, int sna_>
3444 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3445 {
3446 // generic fallback implementation: compute b := a^T * a and return sqrt(det(b))
3447 Tiny::Matrix<T_, n_, n_> b;
3448 for(int i(0); i < n_; ++i)
3449 {
3450 for(int j(0); j < n_; ++j)
3451 {
3452 b(i,j) = T_(0);
3453 for(int k(0); k < m_; ++k)
3454 {
3455 b(i,j) += a(k,i)*a(k,j);
3456 }
3457 }
3458 }
3459 #ifndef __CUDACC__
3460 return Math::sqrt(DetHelper<n_,n_>::compute(b));
3461 #else
3462 return CudaMath::cuda_sqrt(DetHelper<n_,n_>::compute(b));
3463 #endif
3464 }
3465 };
3466
3467 template<int n_>
3468 struct VolHelper<n_,n_>
3469 {
3470 template<typename T_, int sma_, int sna_>
3471 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
3472 {
3473 // square matrix special case: vol(a) = abs(det(a))
3474 #ifndef __CUDACC__
3475 return Math::abs(DetHelper<n_,n_>::compute(a));
3476 #else
3477 return CudaMath::cuda_abs(DetHelper<n_,n_>::compute(a));
3478 #endif
3479 }
3480 };
3481
3482 template<>
3483 struct VolHelper<2,1>
3484 {
3485 template<typename T_, int sma_, int sna_>
3486 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 2, 1, sma_, sna_>& a)
3487 {
3488 // This is the euclid norm of the only matrix column.
3489 #ifndef __CUDACC__
3490 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)));
3491 #else
3492 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)));
3493 #endif
3494 }
3495 };
3496
3497 template<>
3498 struct VolHelper<3,1>
3499 {
3500 template<typename T_, int sma_, int sna_>
3501 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 1, sma_, sna_>& a)
3502 {
3503 // This is the euclid norm of the only matrix column.
3504 #ifndef __CUDACC__
3505 return Math::sqrt(Math::sqr(a(0,0)) + Math::sqr(a(1,0)) + Math::sqr(a(2,0)));
3506 #else
3507 return CudaMath::cuda_sqrt(CudaMath::cuda_sqr(a(0,0)) + CudaMath::cuda_sqr(a(1,0)) + CudaMath::cuda_sqr(a(2,0)));
3508 #endif
3509 }
3510 };
3511
3512 template<>
3513 struct VolHelper<3,2>
3514 {
3515 template<typename T_, int sma_, int sna_>
3516 CUDA_HOST_DEVICE static T_ compute(const Tiny::Matrix<T_, 3, 2, sma_, sna_>& a)
3517 {
3518 // This is the euclid norm of the 3D cross product of the two matrix columns.
3519 #ifndef __CUDACC__
3520 return Math::sqrt(
3521 Math::sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3522 Math::sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3523 Math::sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3524 #else
3525 return CudaMath::cuda_sqrt(
3526 CudaMath::cuda_sqr(a(1,0)*a(2,1) - a(2,0)*a(1,1)) +
3527 CudaMath::cuda_sqr(a(2,0)*a(0,1) - a(0,0)*a(2,1)) +
3528 CudaMath::cuda_sqr(a(0,0)*a(1,1) - a(1,0)*a(0,1)));
3529 #endif
3530 }
3531 };
3532
3538
3539 template<>
3540 struct InverseHelper<1,1>
3541 {
3542 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3543 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>& a)
3544 {
3545 b(0,0) = T_(1) / a(0,0);
3546 return a(0,0);
3547 }
3548 };
3549
3550 template<>
3551 struct InverseHelper<2,2>
3552 {
3553 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3554 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
3555 {
3556 T_ det = a(0,0)*a(1,1) - a(0,1)*a(1,0);
3557 T_ d = T_(1) / det;
3558 b(0,0) = d*a(1,1);
3559 b(0,1) = -d*a(0,1);
3560 b(1,0) = -d*a(1,0);
3561 b(1,1) = d*a(0,0);
3562 return det;
3563 }
3564 };
3565
3566 template<>
3567 struct InverseHelper<3,3>
3568 {
3569 template<typename T_, int smb_, int snb_, int sma_, int sna_>
3570 CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
3571 {
3572 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
3573 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
3574 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
3575 T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0);
3576 T_ d = T_(1) / det;
3577 b(0,0) *= d;
3578 b(1,0) *= d;
3579 b(2,0) *= d;
3580 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
3581 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
3582 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
3583 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
3584 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
3585 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
3586 return det;
3587 }
3588 };
3589
template<>
struct InverseHelper<4,4>
{
  /// Inverts the 4x4 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column-wise from cofactors; the array w caches
  /// 2x2 sub-determinants that are shared by several cofactors.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
  {
    // 2x2 sub-determinants of rows 2 and 3
    T_ w[6];
    w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
    b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
    b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
    b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3]);
    b(1,1) = d*( a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1]);
    b(2,1) = d*(-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0]);
    b(3,1) = d*( a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0]);
    // reuse w for the 2x2 sub-determinants of rows 0 and 1
    w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    // third and fourth columns of the inverse
    b(0,2) = d*( a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3]);
    b(1,2) = d*(-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1]);
    b(2,2) = d*( a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0]);
    b(3,2) = d*(-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0]);
    b(0,3) = d*(-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3]);
    b(1,3) = d*( a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1]);
    b(2,3) = d*(-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0]);
    b(3,3) = d*( a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0]);
    return det;
  }
};
3634
template<>
struct InverseHelper<5,5>
{
  /// Inverts the 5x5 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column-wise from cofactors; w[0..9] caches 2x2
  /// sub-determinants and w[10..19] caches 3x3 sub-determinants, recomputed
  /// for each group of columns.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
  {
    // w[0..9]: 2x2 sub-determinants of rows 3 and 4
    T_ w[20];
    w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
    w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
    w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
    w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
    w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
    w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
    w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
    w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
    w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
    w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
    // w[10..19]: 3x3 sub-determinants of rows 2, 3 and 4
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
    b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
    b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
    b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
    b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0)+a(0,1)*b(1,0)+a(0,2)*b(2,0)+a(0,3)*b(3,0)+a(0,4)*b(4,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    b(4,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16]);
    b(1,1) = d*( a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13]);
    b(2,1) = d*(-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11]);
    b(3,1) = d*( a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10]);
    b(4,1) = d*(-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10]);
    // recompute w[10..19] as 3x3 sub-determinants of rows 1, 3 and 4
    w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
    w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
    w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
    w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
    w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
    w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
    w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
    w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
    w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
    w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];
    // third column of the inverse
    b(0,2) = d*( a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16]);
    b(1,2) = d*(-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13]);
    b(2,2) = d*( a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11]);
    b(3,2) = d*(-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10]);
    b(4,2) = d*( a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10]);
    // recompute w[0..9] as 2x2 sub-determinants of rows 0 and 1
    w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
    w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
    w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
    w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
    // recompute w[10..19] as 3x3 sub-determinants of rows 0, 1 and 2
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];
    // fourth and fifth columns of the inverse
    b(0,3) = d*( a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16]);
    b(1,3) = d*(-a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13]);
    b(2,3) = d*( a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11]);
    b(3,3) = d*(-a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10]);
    b(4,3) = d*( a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10]);
    b(0,4) = d*(-a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16]);
    b(1,4) = d*( a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13]);
    b(2,4) = d*(-a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11]);
    b(3,4) = d*( a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10]);
    b(4,4) = d*(-a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10]);
    return det;
  }
};
3727
template<>
struct InverseHelper<6,6>
{
  /// Inverts the 6x6 matrix \p a into \p b and returns the determinant of \p a.
  ///
  /// The inverse is assembled column pair by column pair from cofactors;
  /// w[0..14] caches 2x2 sub-determinants, w[15..34] caches 3x3
  /// sub-determinants, and w[0..14] is then reused for 4x4 sub-determinants.
  /// The whole cache is recomputed for each pair of columns.
  /// \note \p b must not alias \p a. No singularity check is performed: for
  /// det(a) == 0 the entries of \p b result from a division by zero.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
  {
    // w[0..14]: 2x2 sub-determinants of rows 4 and 5
    T_ w[35];
    w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
    w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
    w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
    w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
    w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
    w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
    w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
    w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
    w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
    w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
    w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
    w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
    w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
    w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
    w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
    // w[15..34]: 3x3 sub-determinants of rows 3, 4 and 5
    w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
    w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
    w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
    w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
    w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
    w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
    w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
    w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
    w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
    w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
    w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
    w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
    w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
    w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
    w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
    w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
    w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
    w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
    w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
    w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 2, 3, 4 and 5
    w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
    w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
    w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
    w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
    w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
    w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
    w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
    w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
    w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
    w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
    w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
    w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
    w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
    w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
    w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
    // first column of the adjugate (still unscaled)
    b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
    b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
    b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
    b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
    b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
    b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
    // Laplace expansion of the determinant along row 0 of a
    T_ det = a(0,0)*b(0,0) + a(0,1)*b(1,0) + a(0,2)*b(2,0)
      + a(0,3)*b(3,0) + a(0,4)*b(4,0) + a(0,5)*b(5,0);
    T_ d = T_(1) / det;
    // scale the first column by 1/det
    b(0,0) *= d;
    b(1,0) *= d;
    b(2,0) *= d;
    b(3,0) *= d;
    b(4,0) *= d;
    b(5,0) *= d;
    // second column of the inverse
    b(0,1) = d*(-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10]);
    b(1,1) = d*( a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6]);
    b(2,1) = d*(-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3]);
    b(3,1) = d*( a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1]);
    b(4,1) = d*(-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0]);
    b(5,1) = d*( a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0]);
    // rebuild w[0..14]: 2x2 sub-determinants of rows 4 and 5
    w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
    w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
    w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
    w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
    w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
    w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
    w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
    w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
    w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
    w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
    w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
    w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
    w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
    w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
    w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
    // w[15..34]: 3x3 sub-determinants of rows 1, 4 and 5
    w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
    w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
    w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
    w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
    w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
    w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
    w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
    w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
    w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
    w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
    w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
    w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
    w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
    w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
    w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
    w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
    w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
    w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
    w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
    w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 0, 1, 4 and 5
    w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
    w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
    w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
    w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
    w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
    w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
    w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
    w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
    w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
    w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
    w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
    w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
    w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
    w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
    w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
    // third and fourth columns of the inverse
    b(0,2) = d*( a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10]);
    b(1,2) = d*(-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6]);
    b(2,2) = d*( a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3]);
    b(3,2) = d*(-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1]);
    b(4,2) = d*( a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0]);
    b(5,2) = d*(-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0]);
    b(0,3) = d*(-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10]);
    b(1,3) = d*( a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6]);
    b(2,3) = d*(-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3]);
    b(3,3) = d*( a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1]);
    b(4,3) = d*(-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0]);
    b(5,3) = d*( a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0]);
    // rebuild w[0..14]: 2x2 sub-determinants of rows 2 and 3
    w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
    w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
    w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
    w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
    w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
    w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
    w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
    w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
    w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
    w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
    // w[15..34]: 3x3 sub-determinants of rows 1, 2 and 3
    w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
    w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
    w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
    w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
    w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
    w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
    w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
    w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
    w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
    w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
    w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
    w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
    w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
    w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
    w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
    w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
    w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
    w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
    w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
    w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
    // reuse w[0..14] for 4x4 sub-determinants of rows 0, 1, 2 and 3
    w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
    w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
    w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
    w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
    w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
    w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
    w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
    w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
    w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
    w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
    w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
    w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
    w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
    w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
    w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
    // fifth and sixth columns of the inverse
    b(0,4) = d*( a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10]);
    b(1,4) = d*(-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6]);
    b(2,4) = d*( a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3]);
    b(3,4) = d*(-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1]);
    b(4,4) = d*( a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0]);
    b(5,4) = d*(-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0]);
    b(0,5) = d*(-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10]);
    b(1,5) = d*( a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6]);
    b(2,5) = d*(-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3]);
    b(3,5) = d*( a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1]);
    b(4,5) = d*(-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0]);
    b(5,5) = d*( a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0]);
    return det;
  }
};
3933
// generic square matrix inversion via Gauss-Jordan elimination:
template<int n_>
struct InverseHelper<n_, n_>
{
  /// Inverts the n x n matrix \p a into \p b in-place via Gauss-Jordan
  /// elimination with a "virtual" pivoting strategy (rows are never physically
  /// swapped; a permutation array is exchanged instead) and returns the
  /// determinant of \p a.
  ///
  /// NOTE(review): the pivot search below only inspects the diagonal entries
  /// b(p[j], p[j]) of the (symmetrically permuted) matrix, not an entire
  /// column. This can select a (near-)zero pivot for invertible matrices
  /// whose permuted diagonal vanishes (e.g. a permutation matrix) -- confirm
  /// this is an accepted limitation for the intended inputs.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static T_ compute(Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a)
  {
    // copy matrix a to b; the elimination below transforms b into a^{-1}
    for (int i(0); i < n_; ++i)
    {
      for (int j(0); j < n_; ++j)
      {
        b(i,j) = a(i,j);
      }
    }

    // create pivot array
    int p[n_];

    // initialize identity permutation
    for(int i(0); i < n_; ++i)
    {
      p[i] = i;
    }

    // initialize determinant to 1; it is accumulated as the product of pivots
    T_ det = T_(1);

    // primary column elimination loop
    for(int k(0); k < n_; ++k)
    {
      // step 1: find a pivot for the elimination of column k
      {
        // for this, we only check the rows p[j] with j >= k, as all
        // rows p[j] with j < k have already been eliminated and are
        // therefore not candidates for pivoting
        #ifdef __CUDACC__
        T_ pivot = CudaMath::cuda_abs(b(p[k], p[k]));
        #else
        T_ pivot = Math::abs(b(p[k], p[k]));
        #endif
        int i = k;

        // loop over all unprocessed rows
        for(int j(k+1); j < n_; ++j)
        {
          // get our matrix value and check whether it can be a pivot
          #ifdef __CUDACC__
          T_ abs_val = CudaMath::cuda_abs(b(p[j], p[j]));
          #else
          T_ abs_val = Math::abs(b(p[j], p[j]));
          #endif
          if(abs_val > pivot)
          {
            pivot = abs_val;
            i = j;
          }
        }

        // do we have to swap rows i and k?
        if(i > k)
        {
          // swap rows "virtually" by exchanging their permutation positions;
          // rows and columns are permuted symmetrically, so the determinant
          // sign is unaffected and no sign bookkeeping is required here
          int t = p[k];
          p[k] = p[i];
          p[i] = t;
        }
      }

      // step 2: process pivot row
      {
        // update determinant by multiplying with the pivot element
        det *= b(p[k], p[k]);

        // get our inverted pivot element
        const T_ pivot = T_(1) / b(p[k], p[k]);

        // replace column entry by unit column entry
        b(p[k], p[k]) = T_(1);

        // scale the whole row by the inverted pivot
        for(int j(0); j < n_; ++j)
        {
          b(p[k], j) *= pivot;
        }
      }

      // step 3: eliminate pivot column

      // loop over all rows of the matrix
      for(int i(0); i < n_; ++i)
      {
        // skip the pivot row
        if(i == p[k])
          continue;

        // fetch elimination factor
        const T_ factor = b(i, p[k]);

        // replace by unit column entry
        b(i, p[k]) = T_(0);

        // process the row
        for(int j(0); j < n_; ++j)
        {
          b(i, j) -= b(p[k], j) * factor;
        }
      }
    }

    // return determinant
    return det;
  }
};
4048
4054
4055 #ifdef __CUDACC__
4056 template<>
4057 struct CudaGroupedInverseHelper<1,1>
4058 {
4059 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4060 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&, const T_& det)
4061 {
4062 if(tg.thread_rank() == 0)
4063 b(0,0) = T_(1) / det;
4064 }
4065 };
4066
4067 template<>
4068 struct CudaGroupedInverseHelper<2,2>
4069 {
4070 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4071 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a, const T_& det)
4072 {
4073 //i think an if else cascade could do better than heavy modulo operations...
4074 T_ d = T_(1) / det;
4075
4076 for(int idx = tg.thread_rank(); idx < 4; idx += tg.num_threads())
4077 {
4078 const int i = idx /2;
4079 const int j = idx %2;
4080 b(i,0) = ((i+j)==1 ? T_(-1) : T_(1)) * d * a(1-j,1-i);
4081
4082 }
4083 b(0,0) = d*a(1,1);
4084 b(0,1) = -d*a(0,1);
4085 b(1,0) = -d*a(1,0);
4086 b(1,1) = d*a(0,0);
4087 }
4088 };
4089
4090 template<>
4091 struct CudaGroupedInverseHelper<3,3>
4092 {
4093 template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
4094 CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a, const T_& det)
4095 {
4096 // for(int i = tg.thread_rank(); i < 3; i += tg.num_threads())
4097 // {
4098 // b[i*snb_+0] = a[1*sna_+1+i-3*(i/2)]*a[2*sna_+(2*(1-i+i/2))] - a[1*sna_+(2*(1-i+i/2))]*a[2*sna_+1+i-3*(i/2)];
4099 // }
4100 // tg.sync();
4101 // T_ det = a[0*sna_+0]*b[0*snb_+0] + a[0*sna_+1]*b[1*snb_+0] + a[0*sna_+2]*b[2*snb_+0];
4102 // T_ d = T_(1) / det;
4103 // for(int idx = tg.thread_rank(); idx < 3; idx += tg.num_threads())
4104 // {
4105 // const int i = idx/3;
4106 // const int j = idx%3;
4107 // b[i*snb_+j] = d*(a[()*sna_+1]*a[2*sna_+2] - a[1*sna_+2]*a[2*sna_+1])
4108
4109 // }
4110 //i think an if else cascade could do better than heavy modulo operations...
4111 T_ d = T_(1) / det;
4112
4113 for(int idx = tg.thread_rank(); idx < 9; idx += tg.num_threads())
4114 {
4115 if(idx == 0)
4116 b(0,0) = d*(a(1,1)*a(2,2) - a(1,2)*a(2,1));
4117 else if(idx == 3)
4118 b(1,0) = d*(a(1,2)*a(2,0) - a(1,0)*a(2,2));
4119 else if(idx == 6)
4120 b(2,0) = d*(a(1,0)*a(2,1) - a(1,1)*a(2,0));
4121 else if(idx == 1)
4122 b(0,1) = d*(a(0,2)*a(2,1) - a(0,1)*a(2,2));
4123 else if(idx == 4)
4124 b(1,1) = d*(a(0,0)*a(2,2) - a(0,2)*a(2,0));
4125 else if(idx == 7)
4126 b(2,1) = d*(a(0,1)*a(2,0) - a(0,0)*a(2,1));
4127 else if(idx == 2)
4128 b(0,2) = d*(a(0,1)*a(1,2) - a(0,2)*a(1,1));
4129 else if(idx == 5)
4130 b(1,2) = d*(a(0,2)*a(1,0) - a(0,0)*a(1,2));
4131 else if(idx == 8)
4132 b(2,2) = d*(a(0,0)*a(1,1) - a(0,1)*a(1,0));
4133 }
4134 }
4135 };
4136
// generic square matrix inversion:
template<int n_>
struct CudaGroupedInverseHelper<n_, n_>
{
  /// Generic grouped n x n inversion: copies \p a into \p b cooperatively and
  /// then delegates the in-place inversion to CudaMath::cuda_grouped_invert_matrix,
  /// executed by the first (up to) 32 threads of \p tg as a coalesced subgroup.
  /// The determinant argument is unused here.
  template<typename T_, typename ThreadGroup_, int smb_, int snb_, int sma_, int sna_>
  CUDA_DEVICE static __forceinline__ void compute(const ThreadGroup_& tg, Tiny::Matrix<T_, n_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, n_, n_, sma_, sna_>& a, const T_&)
  {
    // copy matrix a to b, entries distributed over the thread group
    for (int i = tg.thread_rank(); i < n_*n_; i += tg.num_threads())
    {
      const int row = i/n_;
      const int col = i%n_;
      b.v[row][col] = a.v[row][col];
    }

    // create shared pivot array
    // NOTE(review): this is one static __shared__ array per block -- this
    // presumably assumes that only a single thread group per block executes
    // this helper at a time; confirm with the callers.
    __shared__ int p[n_];
    for (int i = tg.thread_rank(); i < n_; i += tg.num_threads())
    {
      p[i] = 0;
    }
    // barrier: the copy and the pivot initialization above must be visible
    // to the inverting subgroup below
    tg.sync();
    // call grouped invert from first warp subgroup
    if(tg.thread_rank() < 32)
    {
      auto a_g = cg::coalesced_threads();

      CudaMath::cuda_grouped_invert_matrix(a_g, n_, snb_, &b.v[0][0], p);
    }
    tg.sync();
  }
};
4169 #endif // __CUDACC__
4170
4176
4177 template<int m_, int n_>
4178 struct CofactorHelper
4179 {
4180 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4181 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, m_, n_, smb_, snb_>& b, const Tiny::Matrix<T_, m_, n_, sma_, sna_>& a)
4182 {
4183 static_assert(m_ == n_, "cofactor computation is only available for square matrices!");
4184
4185 // compute inverse
4186 const T_ det = Intern::InverseHelper<m_,n_>::compute(b, a);
4187
4188 for(int i(0); i < m_; ++i)
4189 {
4190 for(int j(0); j <= i; ++j)
4191 {
4192 b(i,j) *= det;
4193 }
4194 for(int j(i+1); j < n_; ++j)
4195 {
4196 std::swap(b(i,j), b(j,i));
4197 b(i,j) *= det;
4198 }
4199 }
4200 }
4201 };
4202
4203 template<>
4204 struct CofactorHelper<1,1>
4205 {
4206 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4207 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 1, 1, smb_, snb_>& b, const Tiny::Matrix<T_, 1, 1, sma_, sna_>&)
4208 {
4209 b[0] = T_(1);
4210 }
4211 };
4212
4213 template<>
4214 struct CofactorHelper<2,2>
4215 {
4216 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4217 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 2, 2, smb_, snb_>& b, const Tiny::Matrix<T_, 2, 2, sma_, sna_>& a)
4218 {
4219 b(0,0) = a(1,1);
4220 b(0,1) = -a(0,1);
4221 b(1,0) = -a(1,0);
4222 b(1,1) = a(0,0);
4223 }
4224 };
4225
4226 template<>
4227 struct CofactorHelper<3,3>
4228 {
4229 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4230 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 3, 3, smb_, snb_>& b, const Tiny::Matrix<T_, 3, 3, sma_, sna_>& a)
4231 {
4232 b(0,0) = a(1,1)*a(2,2) - a(1,2)*a(2,1);
4233 b(1,0) = a(1,2)*a(2,0) - a(1,0)*a(2,2);
4234 b(2,0) = a(1,0)*a(2,1) - a(1,1)*a(2,0);
4235 b(0,1) = a(0,2)*a(2,1) - a(0,1)*a(2,2);
4236 b(1,1) = a(0,0)*a(2,2) - a(0,2)*a(2,0);
4237 b(2,1) = a(0,1)*a(2,0) - a(0,0)*a(2,1);
4238 b(0,2) = a(0,1)*a(1,2) - a(0,2)*a(1,1);
4239 b(1,2) = a(0,2)*a(1,0) - a(0,0)*a(1,2);
4240 b(2,2) = a(0,0)*a(1,1) - a(0,1)*a(1,0);
4241 }
4242 };
4243
template<>
struct CofactorHelper<4,4>
{
  /// 4x4 case: the result is assembled column-wise from cofactors, caching
  /// 2x2 sub-determinants in w. The formulas equal those of
  /// InverseHelper<4,4> without the 1/det scaling.
  /// \note \p b must not alias \p a.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 4, 4, smb_, snb_>& b, const Tiny::Matrix<T_, 4, 4, sma_, sna_>& a)
  {
    // 2x2 sub-determinants of rows 2 and 3
    T_ w[6];
    w[0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
    w[1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
    w[2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
    w[3] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
    w[4] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
    w[5] = a(2,2)*a(3,3)-a(2,3)*a(3,2);

    // first two columns of the result
    b(0,0) = a(1,1)*w[5]-a(1,2)*w[4]+a(1,3)*w[3];
    b(1,0) =-a(1,0)*w[5]+a(1,2)*w[2]-a(1,3)*w[1];
    b(2,0) = a(1,0)*w[4]-a(1,1)*w[2]+a(1,3)*w[0];
    b(3,0) =-a(1,0)*w[3]+a(1,1)*w[1]-a(1,2)*w[0];

    b(0,1) =-a(0,1)*w[5]+a(0,2)*w[4]-a(0,3)*w[3];
    b(1,1) = a(0,0)*w[5]-a(0,2)*w[2]+a(0,3)*w[1];
    b(2,1) =-a(0,0)*w[4]+a(0,1)*w[2]-a(0,3)*w[0];
    b(3,1) = a(0,0)*w[3]-a(0,1)*w[1]+a(0,2)*w[0];

    // reuse w for the 2x2 sub-determinants of rows 0 and 1
    w[0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[3] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[4] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[5] = a(0,2)*a(1,3)-a(0,3)*a(1,2);

    // last two columns of the result
    b(0,2) = a(3,1)*w[5]-a(3,2)*w[4]+a(3,3)*w[3];
    b(1,2) =-a(3,0)*w[5]+a(3,2)*w[2]-a(3,3)*w[1];
    b(2,2) = a(3,0)*w[4]-a(3,1)*w[2]+a(3,3)*w[0];
    b(3,2) =-a(3,0)*w[3]+a(3,1)*w[1]-a(3,2)*w[0];

    b(0,3) =-a(2,1)*w[5]+a(2,2)*w[4]-a(2,3)*w[3];
    b(1,3) = a(2,0)*w[5]-a(2,2)*w[2]+a(2,3)*w[1];
    b(2,3) =-a(2,0)*w[4]+a(2,1)*w[2]-a(2,3)*w[0];
    b(3,3) = a(2,0)*w[3]-a(2,1)*w[1]+a(2,2)*w[0];
  }
};
4286
template<>
struct CofactorHelper<5,5>
{
  /// 5x5 case: the result is assembled column-wise from cofactors; w[0..9]
  /// caches 2x2 sub-determinants and w[10..19] caches 3x3 sub-determinants,
  /// recomputed for each group of columns. The formulas equal those of
  /// InverseHelper<5,5> without the 1/det scaling.
  /// \note \p b must not alias \p a.
  template<typename T_, int smb_, int snb_, int sma_, int sna_>
  CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 5, 5, smb_, snb_>& b, const Tiny::Matrix<T_, 5, 5, sma_, sna_>& a)
  {
    // w[0..9]: 2x2 sub-determinants of rows 3 and 4
    T_ w[20];
    w[ 0] = a(3,0)*a(4,1)-a(3,1)*a(4,0);
    w[ 1] = a(3,0)*a(4,2)-a(3,2)*a(4,0);
    w[ 2] = a(3,0)*a(4,3)-a(3,3)*a(4,0);
    w[ 3] = a(3,0)*a(4,4)-a(3,4)*a(4,0);
    w[ 4] = a(3,1)*a(4,2)-a(3,2)*a(4,1);
    w[ 5] = a(3,1)*a(4,3)-a(3,3)*a(4,1);
    w[ 6] = a(3,1)*a(4,4)-a(3,4)*a(4,1);
    w[ 7] = a(3,2)*a(4,3)-a(3,3)*a(4,2);
    w[ 8] = a(3,2)*a(4,4)-a(3,4)*a(4,2);
    w[ 9] = a(3,3)*a(4,4)-a(3,4)*a(4,3);
    // w[10..19]: 3x3 sub-determinants of rows 2, 3 and 4
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];

    // first two columns of the result
    b(0,0) = a(1,1)*w[19]-a(1,2)*w[18]+a(1,3)*w[17]-a(1,4)*w[16];
    b(1,0) =-a(1,0)*w[19]+a(1,2)*w[15]-a(1,3)*w[14]+a(1,4)*w[13];
    b(2,0) = a(1,0)*w[18]-a(1,1)*w[15]+a(1,3)*w[12]-a(1,4)*w[11];
    b(3,0) =-a(1,0)*w[17]+a(1,1)*w[14]-a(1,2)*w[12]+a(1,4)*w[10];
    b(4,0) = a(1,0)*w[16]-a(1,1)*w[13]+a(1,2)*w[11]-a(1,3)*w[10];

    b(0,1) =-a(0,1)*w[19]+a(0,2)*w[18]-a(0,3)*w[17]+a(0,4)*w[16];
    b(1,1) = a(0,0)*w[19]-a(0,2)*w[15]+a(0,3)*w[14]-a(0,4)*w[13];
    b(2,1) =-a(0,0)*w[18]+a(0,1)*w[15]-a(0,3)*w[12]+a(0,4)*w[11];
    b(3,1) = a(0,0)*w[17]-a(0,1)*w[14]+a(0,2)*w[12]-a(0,4)*w[10];
    b(4,1) =-a(0,0)*w[16]+a(0,1)*w[13]-a(0,2)*w[11]+a(0,3)*w[10];

    // recompute w[10..19] as 3x3 sub-determinants of rows 1, 3 and 4
    w[10] = a(1,0)*w[4]-a(1,1)*w[1]+a(1,2)*w[0];
    w[11] = a(1,0)*w[5]-a(1,1)*w[2]+a(1,3)*w[0];
    w[12] = a(1,0)*w[6]-a(1,1)*w[3]+a(1,4)*w[0];
    w[13] = a(1,0)*w[7]-a(1,2)*w[2]+a(1,3)*w[1];
    w[14] = a(1,0)*w[8]-a(1,2)*w[3]+a(1,4)*w[1];
    w[15] = a(1,0)*w[9]-a(1,3)*w[3]+a(1,4)*w[2];
    w[16] = a(1,1)*w[7]-a(1,2)*w[5]+a(1,3)*w[4];
    w[17] = a(1,1)*w[8]-a(1,2)*w[6]+a(1,4)*w[4];
    w[18] = a(1,1)*w[9]-a(1,3)*w[6]+a(1,4)*w[5];
    w[19] = a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7];

    // third column of the result
    b(0,2) = a(0,1)*w[19]-a(0,2)*w[18]+a(0,3)*w[17]-a(0,4)*w[16];
    b(1,2) =-a(0,0)*w[19]+a(0,2)*w[15]-a(0,3)*w[14]+a(0,4)*w[13];
    b(2,2) = a(0,0)*w[18]-a(0,1)*w[15]+a(0,3)*w[12]-a(0,4)*w[11];
    b(3,2) =-a(0,0)*w[17]+a(0,1)*w[14]-a(0,2)*w[12]+a(0,4)*w[10];
    b(4,2) = a(0,0)*w[16]-a(0,1)*w[13]+a(0,2)*w[11]-a(0,3)*w[10];

    // recompute w[0..9] as 2x2 sub-determinants of rows 0 and 1
    w[ 0] = a(0,0)*a(1,1)-a(0,1)*a(1,0);
    w[ 1] = a(0,0)*a(1,2)-a(0,2)*a(1,0);
    w[ 2] = a(0,0)*a(1,3)-a(0,3)*a(1,0);
    w[ 3] = a(0,0)*a(1,4)-a(0,4)*a(1,0);
    w[ 4] = a(0,1)*a(1,2)-a(0,2)*a(1,1);
    w[ 5] = a(0,1)*a(1,3)-a(0,3)*a(1,1);
    w[ 6] = a(0,1)*a(1,4)-a(0,4)*a(1,1);
    w[ 7] = a(0,2)*a(1,3)-a(0,3)*a(1,2);
    w[ 8] = a(0,2)*a(1,4)-a(0,4)*a(1,2);
    w[ 9] = a(0,3)*a(1,4)-a(0,4)*a(1,3);
    // recompute w[10..19] as 3x3 sub-determinants of rows 0, 1 and 2
    w[10] = a(2,0)*w[4]-a(2,1)*w[1]+a(2,2)*w[0];
    w[11] = a(2,0)*w[5]-a(2,1)*w[2]+a(2,3)*w[0];
    w[12] = a(2,0)*w[6]-a(2,1)*w[3]+a(2,4)*w[0];
    w[13] = a(2,0)*w[7]-a(2,2)*w[2]+a(2,3)*w[1];
    w[14] = a(2,0)*w[8]-a(2,2)*w[3]+a(2,4)*w[1];
    w[15] = a(2,0)*w[9]-a(2,3)*w[3]+a(2,4)*w[2];
    w[16] = a(2,1)*w[7]-a(2,2)*w[5]+a(2,3)*w[4];
    w[17] = a(2,1)*w[8]-a(2,2)*w[6]+a(2,4)*w[4];
    w[18] = a(2,1)*w[9]-a(2,3)*w[6]+a(2,4)*w[5];
    w[19] = a(2,2)*w[9]-a(2,3)*w[8]+a(2,4)*w[7];

    // fourth and fifth columns of the result
    b(0,3) = a(4,1)*w[19]-a(4,2)*w[18]+a(4,3)*w[17]-a(4,4)*w[16];
    b(1,3) = -a(4,0)*w[19]+a(4,2)*w[15]-a(4,3)*w[14]+a(4,4)*w[13];
    b(2,3) = a(4,0)*w[18]-a(4,1)*w[15]+a(4,3)*w[12]-a(4,4)*w[11];
    b(3,3) = -a(4,0)*w[17]+a(4,1)*w[14]-a(4,2)*w[12]+a(4,4)*w[10];
    b(4,3) = a(4,0)*w[16]-a(4,1)*w[13]+a(4,2)*w[11]-a(4,3)*w[10];

    b(0,4) = -a(3,1)*w[19]+a(3,2)*w[18]-a(3,3)*w[17]+a(3,4)*w[16];
    b(1,4) = a(3,0)*w[19]-a(3,2)*w[15]+a(3,3)*w[14]-a(3,4)*w[13];
    b(2,4) = -a(3,0)*w[18]+a(3,1)*w[15]-a(3,3)*w[12]+a(3,4)*w[11];
    b(3,4) = a(3,0)*w[17]-a(3,1)*w[14]+a(3,2)*w[12]-a(3,4)*w[10];
    b(4,4) = -a(3,0)*w[16]+a(3,1)*w[13]-a(3,2)*w[11]+a(3,3)*w[10];
  }
};
4378
4379 template<>
4380 struct CofactorHelper<6,6>
4381 {
// Specialization that computes the 6x6 cofactor matrix b of a via a staged
// Laplace expansion: 2x2 minors -> 3x3 minors -> 4x4 minors, then one column
// of b per final row-expansion. Column j of b is assembled from the 5x5
// minors of a that omit row j. NOTE(review): the resulting index layout is
// b(i,j) = +/- M(j,i), i.e. the transposed-cofactor (adjugate) arrangement --
// confirm this matches the cofactor convention documented on set_cofactor().
// Callable on host and device; uses only plain arithmetic, no branching.
4382 template<typename T_, int smb_, int snb_, int sma_, int sna_>
4383 CUDA_HOST_DEVICE static void compute(Tiny::Matrix<T_, 6, 6, smb_, snb_>& b, const Tiny::Matrix<T_, 6, 6, sma_, sna_>& a)
4384 {
// Scratch workspace: slots w[0..14] hold the fifteen 2x2 (later recycled for
// the fifteen 4x4) minors, slots w[15..34] hold the twenty 3x3 minors.
4385 T_ w[35];
// ---- columns 0 and 1 of b ----
// w[0..14]: all 2x2 minors of rows 4,5 (column pairs in lexicographic order)
4386 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4387 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4388 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4389 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4390 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4391 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4392 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4393 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4394 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4395 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4396 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4397 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4398 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4399 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4400 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
// w[15..34]: all 3x3 minors of rows 3,4,5, built from the 2x2 minors above
4401 w[15] = a(3,0)*w[5]-a(3,1)*w[1]+a(3,2)*w[0];
4402 w[16] = a(3,0)*w[6]-a(3,1)*w[2]+a(3,3)*w[0];
4403 w[17] = a(3,0)*w[7]-a(3,1)*w[3]+a(3,4)*w[0];
4404 w[18] = a(3,0)*w[8]-a(3,1)*w[4]+a(3,5)*w[0];
4405 w[19] = a(3,0)*w[9]-a(3,2)*w[2]+a(3,3)*w[1];
4406 w[20] = a(3,0)*w[10]-a(3,2)*w[3]+a(3,4)*w[1];
4407 w[21] = a(3,0)*w[11]-a(3,2)*w[4]+a(3,5)*w[1];
4408 w[22] = a(3,0)*w[12]-a(3,3)*w[3]+a(3,4)*w[2];
4409 w[23] = a(3,0)*w[13]-a(3,3)*w[4]+a(3,5)*w[2];
4410 w[24] = a(3,0)*w[14]-a(3,4)*w[4]+a(3,5)*w[3];
4411 w[25] = a(3,1)*w[9]-a(3,2)*w[6]+a(3,3)*w[5];
4412 w[26] = a(3,1)*w[10]-a(3,2)*w[7]+a(3,4)*w[5];
4413 w[27] = a(3,1)*w[11]-a(3,2)*w[8]+a(3,5)*w[5];
4414 w[28] = a(3,1)*w[12]-a(3,3)*w[7]+a(3,4)*w[6];
4415 w[29] = a(3,1)*w[13]-a(3,3)*w[8]+a(3,5)*w[6];
4416 w[30] = a(3,1)*w[14]-a(3,4)*w[8]+a(3,5)*w[7];
4417 w[31] = a(3,2)*w[12]-a(3,3)*w[10]+a(3,4)*w[9];
4418 w[32] = a(3,2)*w[13]-a(3,3)*w[11]+a(3,5)*w[9];
4419 w[33] = a(3,2)*w[14]-a(3,4)*w[11]+a(3,5)*w[10];
4420 w[34] = a(3,3)*w[14]-a(3,4)*w[13]+a(3,5)*w[12];
4421
// w[0..14] recycled: all 4x4 minors of rows 2,3,4,5 (2x2 minors no longer needed)
4422 w[ 0] = a(2,0)*w[25]-a(2,1)*w[19]+a(2,2)*w[16]-a(2,3)*w[15];
4423 w[ 1] = a(2,0)*w[26]-a(2,1)*w[20]+a(2,2)*w[17]-a(2,4)*w[15];
4424 w[ 2] = a(2,0)*w[27]-a(2,1)*w[21]+a(2,2)*w[18]-a(2,5)*w[15];
4425 w[ 3] = a(2,0)*w[28]-a(2,1)*w[22]+a(2,3)*w[17]-a(2,4)*w[16];
4426 w[ 4] = a(2,0)*w[29]-a(2,1)*w[23]+a(2,3)*w[18]-a(2,5)*w[16];
4427 w[ 5] = a(2,0)*w[30]-a(2,1)*w[24]+a(2,4)*w[18]-a(2,5)*w[17];
4428 w[ 6] = a(2,0)*w[31]-a(2,2)*w[22]+a(2,3)*w[20]-a(2,4)*w[19];
4429 w[ 7] = a(2,0)*w[32]-a(2,2)*w[23]+a(2,3)*w[21]-a(2,5)*w[19];
4430 w[ 8] = a(2,0)*w[33]-a(2,2)*w[24]+a(2,4)*w[21]-a(2,5)*w[20];
4431 w[ 9] = a(2,0)*w[34]-a(2,3)*w[24]+a(2,4)*w[23]-a(2,5)*w[22];
4432 w[10] = a(2,1)*w[31]-a(2,2)*w[28]+a(2,3)*w[26]-a(2,4)*w[25];
4433 w[11] = a(2,1)*w[32]-a(2,2)*w[29]+a(2,3)*w[27]-a(2,5)*w[25];
4434 w[12] = a(2,1)*w[33]-a(2,2)*w[30]+a(2,4)*w[27]-a(2,5)*w[26];
4435 w[13] = a(2,1)*w[34]-a(2,3)*w[30]+a(2,4)*w[29]-a(2,5)*w[28];
4436 w[14] = a(2,2)*w[34]-a(2,3)*w[33]+a(2,4)*w[32]-a(2,5)*w[31];
4437
// column 0 of b: expand along row 1 -> signed 5x5 minors omitting row 0
4438 b(0,0) = a(1,1)*w[14]-a(1,2)*w[13]+a(1,3)*w[12]-a(1,4)*w[11]+a(1,5)*w[10];
4439 b(1,0) = -a(1,0)*w[14]+a(1,2)*w[9]-a(1,3)*w[8]+a(1,4)*w[7]-a(1,5)*w[6];
4440 b(2,0) = a(1,0)*w[13]-a(1,1)*w[9]+a(1,3)*w[5]-a(1,4)*w[4]+a(1,5)*w[3];
4441 b(3,0) = -a(1,0)*w[12]+a(1,1)*w[8]-a(1,2)*w[5]+a(1,4)*w[2]-a(1,5)*w[1];
4442 b(4,0) = a(1,0)*w[11]-a(1,1)*w[7]+a(1,2)*w[4]-a(1,3)*w[2]+a(1,5)*w[0];
4443 b(5,0) = -a(1,0)*w[10]+a(1,1)*w[6]-a(1,2)*w[3]+a(1,3)*w[1]-a(1,4)*w[0];
4444
// column 1 of b: expand along row 0 with opposite signs -> minors omitting row 1
4445 b(0,1) =-a(0,1)*w[14]+a(0,2)*w[13]-a(0,3)*w[12]+a(0,4)*w[11]-a(0,5)*w[10];
4446 b(1,1) = a(0,0)*w[14]-a(0,2)*w[9]+a(0,3)*w[8]-a(0,4)*w[7]+a(0,5)*w[6];
4447 b(2,1) =-a(0,0)*w[13]+a(0,1)*w[9]-a(0,3)*w[5]+a(0,4)*w[4]-a(0,5)*w[3];
4448 b(3,1) = a(0,0)*w[12]-a(0,1)*w[8]+a(0,2)*w[5]-a(0,4)*w[2]+a(0,5)*w[1];
4449 b(4,1) =-a(0,0)*w[11]+a(0,1)*w[7]-a(0,2)*w[4]+a(0,3)*w[2]-a(0,5)*w[0];
4450 b(5,1) = a(0,0)*w[10]-a(0,1)*w[6]+a(0,2)*w[3]-a(0,3)*w[1]+a(0,4)*w[0];
4451
// ---- columns 2 and 3 of b ----
// w[0..14]: 2x2 minors of rows 4,5 again (previous values were overwritten above)
4452 w[ 0] = a(4,0)*a(5,1)-a(4,1)*a(5,0);
4453 w[ 1] = a(4,0)*a(5,2)-a(4,2)*a(5,0);
4454 w[ 2] = a(4,0)*a(5,3)-a(4,3)*a(5,0);
4455 w[ 3] = a(4,0)*a(5,4)-a(4,4)*a(5,0);
4456 w[ 4] = a(4,0)*a(5,5)-a(4,5)*a(5,0);
4457 w[ 5] = a(4,1)*a(5,2)-a(4,2)*a(5,1);
4458 w[ 6] = a(4,1)*a(5,3)-a(4,3)*a(5,1);
4459 w[ 7] = a(4,1)*a(5,4)-a(4,4)*a(5,1);
4460 w[ 8] = a(4,1)*a(5,5)-a(4,5)*a(5,1);
4461 w[ 9] = a(4,2)*a(5,3)-a(4,3)*a(5,2);
4462 w[10] = a(4,2)*a(5,4)-a(4,4)*a(5,2);
4463 w[11] = a(4,2)*a(5,5)-a(4,5)*a(5,2);
4464 w[12] = a(4,3)*a(5,4)-a(4,4)*a(5,3);
4465 w[13] = a(4,3)*a(5,5)-a(4,5)*a(5,3);
4466 w[14] = a(4,4)*a(5,5)-a(4,5)*a(5,4);
// w[15..34]: 3x3 minors of rows 1,4,5
4467 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4468 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4469 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4470 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4471 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4472 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4473 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4474 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4475 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4476 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4477 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4478 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4479 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4480 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4481 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4482 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4483 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4484 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4485 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4486 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
// w[0..14] recycled: 4x4 minors of rows 0,1,4,5
4487 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4488 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4489 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4490 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4491 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4492 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4493 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4494 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4495 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4496 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4497 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4498 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4499 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4500 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4501 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4502
// column 2 of b: expand along row 3 -> minors omitting row 2
4503 b(0,2) = a(3,1)*w[14]-a(3,2)*w[13]+a(3,3)*w[12]-a(3,4)*w[11]+a(3,5)*w[10];
4504 b(1,2) =-a(3,0)*w[14]+a(3,2)*w[9]-a(3,3)*w[8]+a(3,4)*w[7]-a(3,5)*w[6];
4505 b(2,2) = a(3,0)*w[13]-a(3,1)*w[9]+a(3,3)*w[5]-a(3,4)*w[4]+a(3,5)*w[3];
4506 b(3,2) =-a(3,0)*w[12]+a(3,1)*w[8]-a(3,2)*w[5]+a(3,4)*w[2]-a(3,5)*w[1];
4507 b(4,2) = a(3,0)*w[11]-a(3,1)*w[7]+a(3,2)*w[4]-a(3,3)*w[2]+a(3,5)*w[0];
4508 b(5,2) =-a(3,0)*w[10]+a(3,1)*w[6]-a(3,2)*w[3]+a(3,3)*w[1]-a(3,4)*w[0];
4509
// column 3 of b: expand along row 2 with opposite signs -> minors omitting row 3
4510 b(0,3) =-a(2,1)*w[14]+a(2,2)*w[13]-a(2,3)*w[12]+a(2,4)*w[11]-a(2,5)*w[10];
4511 b(1,3) = a(2,0)*w[14]-a(2,2)*w[9]+a(2,3)*w[8]-a(2,4)*w[7]+a(2,5)*w[6];
4512 b(2,3) =-a(2,0)*w[13]+a(2,1)*w[9]-a(2,3)*w[5]+a(2,4)*w[4]-a(2,5)*w[3];
4513 b(3,3) = a(2,0)*w[12]-a(2,1)*w[8]+a(2,2)*w[5]-a(2,4)*w[2]+a(2,5)*w[1];
4514 b(4,3) =-a(2,0)*w[11]+a(2,1)*w[7]-a(2,2)*w[4]+a(2,3)*w[2]-a(2,5)*w[0];
4515 b(5,3) = a(2,0)*w[10]-a(2,1)*w[6]+a(2,2)*w[3]-a(2,3)*w[1]+a(2,4)*w[0];
4516
// ---- columns 4 and 5 of b ----
// w[0..14]: 2x2 minors of rows 2,3
4517 w[ 0] = a(2,0)*a(3,1)-a(2,1)*a(3,0);
4518 w[ 1] = a(2,0)*a(3,2)-a(2,2)*a(3,0);
4519 w[ 2] = a(2,0)*a(3,3)-a(2,3)*a(3,0);
4520 w[ 3] = a(2,0)*a(3,4)-a(2,4)*a(3,0);
4521 w[ 4] = a(2,0)*a(3,5)-a(2,5)*a(3,0);
4522 w[ 5] = a(2,1)*a(3,2)-a(2,2)*a(3,1);
4523 w[ 6] = a(2,1)*a(3,3)-a(2,3)*a(3,1);
4524 w[ 7] = a(2,1)*a(3,4)-a(2,4)*a(3,1);
4525 w[ 8] = a(2,1)*a(3,5)-a(2,5)*a(3,1);
4526 w[ 9] = a(2,2)*a(3,3)-a(2,3)*a(3,2);
4527 w[10] = a(2,2)*a(3,4)-a(2,4)*a(3,2);
4528 w[11] = a(2,2)*a(3,5)-a(2,5)*a(3,2);
4529 w[12] = a(2,3)*a(3,4)-a(2,4)*a(3,3);
4530 w[13] = a(2,3)*a(3,5)-a(2,5)*a(3,3);
4531 w[14] = a(2,4)*a(3,5)-a(2,5)*a(3,4);
// w[15..34]: 3x3 minors of rows 1,2,3
4532 w[15] = a(1,0)*w[5]-a(1,1)*w[1]+a(1,2)*w[0];
4533 w[16] = a(1,0)*w[6]-a(1,1)*w[2]+a(1,3)*w[0];
4534 w[17] = a(1,0)*w[7]-a(1,1)*w[3]+a(1,4)*w[0];
4535 w[18] = a(1,0)*w[8]-a(1,1)*w[4]+a(1,5)*w[0];
4536 w[19] = a(1,0)*w[9]-a(1,2)*w[2]+a(1,3)*w[1];
4537 w[20] = a(1,0)*w[10]-a(1,2)*w[3]+a(1,4)*w[1];
4538 w[21] = a(1,0)*w[11]-a(1,2)*w[4]+a(1,5)*w[1];
4539 w[22] = a(1,0)*w[12]-a(1,3)*w[3]+a(1,4)*w[2];
4540 w[23] = a(1,0)*w[13]-a(1,3)*w[4]+a(1,5)*w[2];
4541 w[24] = a(1,0)*w[14]-a(1,4)*w[4]+a(1,5)*w[3];
4542 w[25] = a(1,1)*w[9]-a(1,2)*w[6]+a(1,3)*w[5];
4543 w[26] = a(1,1)*w[10]-a(1,2)*w[7]+a(1,4)*w[5];
4544 w[27] = a(1,1)*w[11]-a(1,2)*w[8]+a(1,5)*w[5];
4545 w[28] = a(1,1)*w[12]-a(1,3)*w[7]+a(1,4)*w[6];
4546 w[29] = a(1,1)*w[13]-a(1,3)*w[8]+a(1,5)*w[6];
4547 w[30] = a(1,1)*w[14]-a(1,4)*w[8]+a(1,5)*w[7];
4548 w[31] = a(1,2)*w[12]-a(1,3)*w[10]+a(1,4)*w[9];
4549 w[32] = a(1,2)*w[13]-a(1,3)*w[11]+a(1,5)*w[9];
4550 w[33] = a(1,2)*w[14]-a(1,4)*w[11]+a(1,5)*w[10];
4551 w[34] = a(1,3)*w[14]-a(1,4)*w[13]+a(1,5)*w[12];
4552
// w[0..14] recycled: 4x4 minors of rows 0,1,2,3
4553 w[ 0] = a(0,0)*w[25]-a(0,1)*w[19]+a(0,2)*w[16]-a(0,3)*w[15];
4554 w[ 1] = a(0,0)*w[26]-a(0,1)*w[20]+a(0,2)*w[17]-a(0,4)*w[15];
4555 w[ 2] = a(0,0)*w[27]-a(0,1)*w[21]+a(0,2)*w[18]-a(0,5)*w[15];
4556 w[ 3] = a(0,0)*w[28]-a(0,1)*w[22]+a(0,3)*w[17]-a(0,4)*w[16];
4557 w[ 4] = a(0,0)*w[29]-a(0,1)*w[23]+a(0,3)*w[18]-a(0,5)*w[16];
4558 w[ 5] = a(0,0)*w[30]-a(0,1)*w[24]+a(0,4)*w[18]-a(0,5)*w[17];
4559 w[ 6] = a(0,0)*w[31]-a(0,2)*w[22]+a(0,3)*w[20]-a(0,4)*w[19];
4560 w[ 7] = a(0,0)*w[32]-a(0,2)*w[23]+a(0,3)*w[21]-a(0,5)*w[19];
4561 w[ 8] = a(0,0)*w[33]-a(0,2)*w[24]+a(0,4)*w[21]-a(0,5)*w[20];
4562 w[ 9] = a(0,0)*w[34]-a(0,3)*w[24]+a(0,4)*w[23]-a(0,5)*w[22];
4563 w[10] = a(0,1)*w[31]-a(0,2)*w[28]+a(0,3)*w[26]-a(0,4)*w[25];
4564 w[11] = a(0,1)*w[32]-a(0,2)*w[29]+a(0,3)*w[27]-a(0,5)*w[25];
4565 w[12] = a(0,1)*w[33]-a(0,2)*w[30]+a(0,4)*w[27]-a(0,5)*w[26];
4566 w[13] = a(0,1)*w[34]-a(0,3)*w[30]+a(0,4)*w[29]-a(0,5)*w[28];
4567 w[14] = a(0,2)*w[34]-a(0,3)*w[33]+a(0,4)*w[32]-a(0,5)*w[31];
4568
// column 4 of b: expand along row 5 -> minors omitting row 4
4569 b(0,4) = a(5,1)*w[14]-a(5,2)*w[13]+a(5,3)*w[12]-a(5,4)*w[11]+a(5,5)*w[10];
4570 b(1,4) =-a(5,0)*w[14]+a(5,2)*w[9]-a(5,3)*w[8]+a(5,4)*w[7]-a(5,5)*w[6];
4571 b(2,4) = a(5,0)*w[13]-a(5,1)*w[9]+a(5,3)*w[5]-a(5,4)*w[4]+a(5,5)*w[3];
4572 b(3,4) =-a(5,0)*w[12]+a(5,1)*w[8]-a(5,2)*w[5]+a(5,4)*w[2]-a(5,5)*w[1];
4573 b(4,4) = a(5,0)*w[11]-a(5,1)*w[7]+a(5,2)*w[4]-a(5,3)*w[2]+a(5,5)*w[0];
4574 b(5,4) =-a(5,0)*w[10]+a(5,1)*w[6]-a(5,2)*w[3]+a(5,3)*w[1]-a(5,4)*w[0];
4575
// column 5 of b: expand along row 4 with opposite signs -> minors omitting row 5
4576 b(0,5) =-a(4,1)*w[14]+a(4,2)*w[13]-a(4,3)*w[12]+a(4,4)*w[11]-a(4,5)*w[10];
4577 b(1,5) = a(4,0)*w[14]-a(4,2)*w[9]+a(4,3)*w[8]-a(4,4)*w[7]+a(4,5)*w[6];
4578 b(2,5) =-a(4,0)*w[13]+a(4,1)*w[9]-a(4,3)*w[5]+a(4,4)*w[4]-a(4,5)*w[3];
4579 b(3,5) = a(4,0)*w[12]-a(4,1)*w[8]+a(4,2)*w[5]-a(4,4)*w[2]+a(4,5)*w[1];
4580 b(4,5) =-a(4,0)*w[11]+a(4,1)*w[7]-a(4,2)*w[4]+a(4,3)*w[2]-a(4,5)*w[0];
4581 b(5,5) = a(4,0)*w[10]-a(4,1)*w[6]+a(4,2)*w[3]-a(4,3)*w[1]+a(4,4)*w[0];
4582 }
4583 };
4584 } // namespace Intern
4586 } // namespace Tiny
4587} // namespace FEAT
#define XABORTM(msg)
Abort macro definition with custom message.
Definition: assertion.hpp:192
#define ASSERTM(expr, msg)
Debug-Assertion macro definition with custom message.
Definition: assertion.hpp:230
#define XASSERTM(expr, msg)
Assertion macro definition with custom message.
Definition: assertion.hpp:263
FEAT Kernel base header.
Math Limits class template.
Definition: math.hpp:1475
Tiny Matrix class template.
CUDA_HOST_DEVICE DataType det() const
Returns the determinant of the matrix.
CUDA_HOST_DEVICE Matrix(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE Matrix & set_cofactor(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the cofactor matrix of another matrix.
CUDA_HOST_DEVICE Matrix & add_scalar_main_diag(DataType alpha)
Adds a value onto the matrix's main diagonal.
CUDA_HOST_DEVICE Matrix & add_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y, const DataType alpha=DataType(1))
Adds the outer product of two vectors onto the matrix.
CUDA_HOST_DEVICE Matrix & add_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b, DataType alpha=DataType(1))
Adds the algebraic matrix-product of two other matrices onto this matrix.
CUDA_HOST_DEVICE Matrix & set_mat_mat_mult(const Matrix< T_, m_, la_, sma_, sna_ > &a, const Matrix< T_, lb_, n_, smb_, snb_ > &b)
Sets this matrix to the algebraic matrix-product of two other matrices.
CUDA_HOST_DEVICE DataType norm_frobenius_sqr() const
Returns the Frobenius norm squared of the matrix.
T_ ValueType
the data type of the matrix
CUDA_HOST_DEVICE Matrix & operator=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
assignment operator
CUDA_HOST_DEVICE Matrix(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE Matrix & add_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a vector-tensor left-product onto this matrix.
CUDA_HOST_DEVICE Matrix & set_transpose(const Matrix< T_, n_, m_, sma_, sna_ > &a)
Sets this matrix to the transpose of another matrix.
CUDA_HOST friend std::istream & operator>>(std::istream &in, Matrix &A)
Tiny::Matrix streaming operator.
CUDA_HOST_DEVICE Matrix & axpy(DataType alpha, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Adds another scaled matrix onto this matrix.
CUDA_HOST_DEVICE Matrix & operator-=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise subtraction operator
CUDA_HOST_DEVICE Matrix(const Matrix< T2_, m_, n_, sma_, sna_ > &a)
copy constructor
static CUDA_HOST_DEVICE Matrix null()
Returns a null-matrix.
CUDA_HOST_DEVICE DataType norm_sub_id_frobenius() const
Returns the Frobenius norm of the difference of this matrix and the identity matrix.
CUDA_HOST_DEVICE Matrix & set_inverse(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this matrix to the inverse of another matrix.
CUDA_HOST_DEVICE const RowType & operator[](int i) const
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator*=(DataType alpha)
scalar-right-multiply-by operator
CUDA_HOST_DEVICE DataType scalar_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y) const
Computes the scalar product of two vectors with this matrix.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
Vector< T_, n_, sn_ > RowType
the type of a single matrix row
CUDA_HOST_DEVICE const T_ & operator()(int i, int j) const
Access operator.
CUDA_HOST_DEVICE Matrix & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Matrix & set_rotation_3d(T_ yaw, T_ pitch, T_ roll)
Sets this matrix to a 3D yaw-pitch-roll rotation matrix.
CUDA_HOST_DEVICE DataType trace() const
Returns the trace of the matrix.
CUDA_HOST_DEVICE DataType norm_hessian_sqr() const
Returns the Hessian norm square.
CUDA_HOST_DEVICE Matrix(const std::initializer_list< Tx_ > &x)
Initializer list of Tiny::Vector constructor.
CUDA_HOST_DEVICE Matrix & set_gram(const Matrix< T_, l_, n_, sla_, sna_ > &a)
Sets this matrix to the Gram matrix of another matrix.
CUDA_HOST_DEVICE Matrix & set_rotation_2d(T_ angle)
Sets this matrix to a 2D rotation matrix.
CUDA_HOST_DEVICE Matrix & set_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, T_ alpha=T_(1))
Sets this matrix to the algebraic matrix double-product of three other matrices.
CUDA_HOST_DEVICE T_ & operator()(int i, int j)
Access operator.
CUDA_HOST_DEVICE Matrix & add_double_mat_mult(const Matrix< T_, k_, l_, sma_, sna_ > &a, const Matrix< T_, k_, m_, smb_, snb_ > &b, const Matrix< T_, l_, n_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the algebraic matrix double-product of three other matrices onto this matrix.
CUDA_HOST_DEVICE void convert(const Matrix< Tx_, m_, n_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Matrix & set_identity()
Sets this matrix to the identity matrix.
CUDA_HOST_DEVICE Matrix & operator+=(const Matrix< T_, m_, n_, sma_, sna_ > &a)
matrix component-wise addition operator
CUDA_HOST_DEVICE RowType & operator[](int i)
Row-Access operator.
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< std::initializer_list< Tx_ > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix()
default constructor
CUDA_HOST_DEVICE void copy_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Copies the upper left mm_ x nn_ entries of a matrix.
CUDA_HOST_DEVICE DataType vol() const
Returns the volume of the matrix.
RowType v[sm_]
actual matrix data; that's an array of vectors
CUDA_HOST_DEVICE Matrix & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Matrix & set_vec_tensor_mult(const Vector< T_, l_, snv_ > &x, const Tensor3< T_, l_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Sets this matrix to the result of a vector-tensor left-product.
CUDA_HOST_DEVICE Matrix & set_outer_product(const Vector< T_, m_, snx_ > &x, const Vector< T_, n_, sny_ > &y)
Sets this matrix to the outer product of two vectors.
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the matrix.
CUDA_HOST_DEVICE void copy(const Matrix< T_, m_, n_, sma_, sna_ > &a)
Copies a matrix.
CUDA_HOST_DEVICE DataType norm_frobenius() const
Returns the Frobenius norm of the matrix.
Tiny Tensor3 class template.
CUDA_HOST_DEVICE Tensor3 & add_double_mat_mult(const Tensor3< T_, lt_, mt_, nt_, slt_, smt_, snt_ > &t, const Matrix< T_, nt_, n_, smb_, snb_ > &b, const Matrix< T_, mt_, m_, smd_, snd_ > &d, DataType alpha=DataType(1))
Adds the result of a matrix-tensor-matrix double-product onto this tensor.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE void copy_n(const Tensor3< T_, la_, ma_, na_, sla_, sma_, sna_ > &a)
Copies the upper left ll_ x mm_ x nn_ entries of a tensor.
static constexpr int l
the tube count of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
static constexpr int sn
the column stride of the tensor
CUDA_HOST_DEVICE Tensor3 & operator=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-assignment operator
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
formats the tensor
CUDA_HOST_DEVICE const T_ & operator()(int h, int i, int j) const
Access operator.
CUDA_HOST_DEVICE Tensor3 & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Tensor3 & operator=(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE Tensor3 & add_vec_mat_outer_product(const Vector< T_, l_, slx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix outer product onto this tensor.
CUDA_HOST_DEVICE Tensor3 & operator+=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise addition operator
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< std::initializer_list< std::initializer_list< Tx_ > > > &x)
Initializer list constructor.
CUDA_HOST_DEVICE const PlaneType & operator[](int h) const
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
copy-constructor
Matrix< T_, m_, n_, sm_, sn_ > PlaneType
Type of tensor data; that's an array of matrices.
static constexpr int sl
the tube stride of the tensor
CUDA_HOST_DEVICE Tensor3(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
CUDA_HOST_DEVICE PlaneType & operator[](int h)
Plane-Access operator.
CUDA_HOST_DEVICE Tensor3(DataType value)
value-assignment constructor
static CUDA_HOST_DEVICE Tensor3 null()
Returns a null-tensor.
CUDA_HOST_DEVICE Tensor3 & add_mat_tensor_mult(const Matrix< T_, l_, k_, sma_, sna_ > &a, const Tensor3< T_, k_, m_, n_, slt_, smt_, snt_ > &t, DataType alpha=DataType(1))
Adds the result of a matrix-tensor product onto this tensor.
PlaneType v[sl_]
Actual tensor data.
static constexpr int m
the row count of the tensor
CUDA_HOST_DEVICE void convert(const Tensor3< Tx_, l_, m_, n_, sla_, sma_, sna_ > &a)
conversion operator
CUDA_HOST_DEVICE Tensor3()
default constructor
CUDA_HOST_DEVICE Tensor3 & operator-=(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
tensor component-wise subtraction operator
CUDA_HOST_DEVICE T_ & operator()(int h, int i, int j)
Access operator.
static constexpr int n
the column count of the tensor
static constexpr int sm
the row stride of the tensor
T_ ValueType
the data type of the tensor
CUDA_HOST_DEVICE Tensor3 & operator*=(DataType alpha)
scalar right-multiply-by operator
CUDA_HOST_DEVICE void copy(const Tensor3< T_, l_, m_, n_, sla_, sma_, sna_ > &a)
Copies a tensor3.
Tiny Vector class template.
static CUDA_HOST_DEVICE Vector null()
Returns a null-vector.
CUDA_HOST_DEVICE DataType norm_euclid_n() const
Computes the euclid norm of first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST_DEVICE DataType norm_euclid_sqr_n() const
Computes the squared euclid norm of first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & normalize_n()
Normalizes the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & operator=(const Vector< T_, n_, sx_ > &x)
copy-assignment operator
T_ v[s_]
actual vector data
CUDA_HOST_DEVICE bool normalized() const
Return if the vector is normalized.
CUDA_HOST_DEVICE void format_n(DataType alpha=DataType(0))
Formats the first nn_ entries of the vector.
static CUDA_HOST_DEVICE Vector convert_new(Vector &&x)
overload for moveable rvalue type
CUDA_HOST_DEVICE void format(DataType alpha=DataType(0))
Formats the vector.
CUDA_HOST_DEVICE Vector(const Vector< T_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & operator=(const std::initializer_list< Tx_ > &x)
Initializer list assignment operator.
CUDA_HOST_DEVICE DataType norm_max() const
Computes the max-norm of this vector.
CUDA_HOST_DEVICE T_ & operator()(int i)
Access operator.
CUDA_HOST_DEVICE const T_ & operator[](int i) const
Access operator.
CUDA_HOST_DEVICE Vector & operator-=(const Vector< T_, n_, sx_ > &x)
vector-subtract operator
CUDA_HOST_DEVICE Vector & set_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a)
Sets this vector to the result of a vector-matrix product.
static CUDA_HOST_DEVICE Vector convert_new(const Vector< Tx_, n_, sx_ > &x)
convert function, not callable with non convertable inner type
CUDA_HOST_DEVICE Vector & add_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE Vector(const Vector< Tx_, n_, sx_ > &x)
copy constructor
CUDA_HOST_DEVICE void scale_n(DataType alpha)
Scales the first nn_ entries of the vector.
CUDA_HOST_DEVICE Vector & set_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x)
Sets the first nn_ entries of this vector to the result of a matrix-vector product with the first mm_...
static constexpr int n
the length of the vector
CUDA_HOST_DEVICE Vector & operator*=(DataType alpha)
scalar-multiply operator
CUDA_HOST_DEVICE Vector()
default constructor
CUDA_HOST_DEVICE DataType norm_l1() const
Computes the l1-norm of this vector.
CUDA_HOST_DEVICE Vector & set_convex(DataType alpha, const Vector< T_, n_, sna_ > &a, const Vector< T_, n_, snb_ > &b)
Sets this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE Vector & axpy_n(DataType alpha, const Vector< T_, nx_, snx_ > &x)
Adds the first nn_ entries of another scaled vector onto this vector.
CUDA_HOST_DEVICE Vector & add_mat_vec_mult_n(const Matrix< T_, ma_, na_, sma_, sna_ > &a, const Vector< T_, nx_, sx_ > &x, DataType alpha=DataType(1))
Adds the result of a matrix-vector product onto this vector.
CUDA_HOST friend std::ostream & operator<<(std::ostream &lhs, const Vector &b)
Tiny::Vector streaming operator.
CUDA_HOST_DEVICE Vector & operator+=(const Vector< T_, n_, sx_ > &x)
vector-add operator
CUDA_HOST_DEVICE Vector & operator=(DataType value)
value-assignment operator
CUDA_HOST_DEVICE Vector & operator=(const Vector< Tx_, n_, sx_ > &x)
copy-assignment operator
CUDA_HOST_DEVICE const T_ & operator()(int i) const
Access operator.
CUDA_HOST_DEVICE Vector & axpy(DataType alpha, const Vector< T_, n_, snx_ > &x)
Adds another scaled vector onto this vector.
CUDA_HOST_DEVICE void copy(const Vector< T_, n_, snx_ > &x)
Copies a vector.
CUDA_HOST_DEVICE Vector(const std::array< Tx_, n_ > &x)
copy constructor
CUDA_HOST_DEVICE Vector & normalize()
Normalizes this vector.
CUDA_HOST_DEVICE Vector & operator=(const std::array< Tx_, n_ > &x)
copy-assignment operator
CUDA_HOST_DEVICE Vector & negate()
Negates the vector, i.e. effectively multiplies all components by -1.
Intern::DataTypeExtractor< ValueType >::MyDataType DataType
The basic data type buried in the lowest level of the vector.
CUDA_HOST_DEVICE Vector(DataType value)
value-assignment constructor
CUDA_HOST_DEVICE DataType norm_max_n() const
Computes the max-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & negate_n()
Negates the first nn_ entries of this vector.
CUDA_HOST_DEVICE DataType norm_l1_n() const
Computes the l1-norm of the first nn_ entries of this vector.
CUDA_HOST_DEVICE Vector & set_convex_n(DataType alpha, const Vector< T_, na_, sna_ > &a, const Vector< T_, nb_, snb_ > &b)
Sets the first nn_ entries of this vector to the convex combination of two other vectors.
CUDA_HOST_DEVICE DataType norm_euclid_sqr() const
Computes the squared euclid norm of this vector.
CUDA_HOST_DEVICE void copy_n(const Vector< T_, nx_, snx_ > &x)
Copies the first nn_ entries of a vector.
CUDA_HOST_DEVICE void scale(DataType alpha)
Scales the vector.
CUDA_HOST_DEVICE Vector(const std::initializer_list< Tx_ > &x)
Initializer list constructor.
static constexpr int s
the stride of the vector
CUDA_HOST_DEVICE Vector & set_mat_vec_mult(const Matrix< T_, n_, m_, sma_, sna_ > &a, const Vector< T_, m_, sx_ > &x)
Sets this vector to the result of a matrix-vector product.
CUDA_HOST_DEVICE Vector & add_vec_mat_mult(const Vector< T_, m_, sx_ > &x, const Matrix< T_, m_, n_, sma_, sna_ > &a, DataType alpha=DataType(1))
Adds the result of a vector-matrix product onto this vector.
CUDA_HOST_DEVICE T_ & operator[](int i)
Access operator.
T_ ValueType
the value type of the vector
CUDA_HOST_DEVICE DataType norm_euclid() const
Computes the euclid norm of this vector.
CUDA_HOST_DEVICE Vector & set_vec_mat_mult_n(const Vector< T_, mx_, smx_ > &x, const Matrix< T_, ma_, na_, sma_, sna_ > &a)
Sets the first mm_ entries of this vector to the result of a vector-matrix product with the first mm_...
CUDA_HOST_DEVICE void convert(const Vector< Tx_, n_, sx_ > &x)
conversion operator
CUDA_HOST_DEVICE Vector & operator*=(const Vector< T_, n_, sx_ > &x)
element-wise-multiply operator
T_ sqrt(T_ x)
Returns the square-root of a value.
Definition: math.hpp:300
T_ abs(T_ x)
Returns the absolute value.
Definition: math.hpp:275
DT_ calc_opening_angle(DT_ x1, DT_ x2, DT_ y1, DT_ y2)
Calculates the opening angle of two 2D vectors.
Definition: math.hpp:1450
T_ pow(T_ x, T_ y)
Returns x raised to the power of y.
Definition: math.hpp:643
T_ sin(T_ x)
Returns the sine of a value.
Definition: math.hpp:344
T_ sqr(T_ x)
Returns the square of a value.
Definition: math.hpp:95
T_ min(T_ a, T_ b)
Returns the minimum of two values.
Definition: math.hpp:123
T_ cos(T_ x)
Returns the cosine of a value.
Definition: math.hpp:386
T_ max(T_ a, T_ b)
Returns the maximum of two values.
Definition: math.hpp:137
CUDA_HOST_DEVICE Vector< T_, n_ > operator+(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector addition operator
CUDA_HOST Vector< T_, dim_ > project_onto(const Vector< T_, dim_ > &x, const Vector< T_, dim_ > &y)
Calculates the projected vector.
CUDA_HOST_DEVICE T_ dot(const T_ &a, const T_ &b)
Computes the dot-product of two scalars.
CUDA_HOST_DEVICE void orthogonal_3x2(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 3x2 matrix.
CUDA_HOST_DEVICE void axpy(T_ &y, const T_ &x, const T_ &alpha)
Performs an AXPY of two scalars.
CUDA_HOST_DEVICE void orthogonal_2x1(Vector< T_, mx_, smx_ > &nu, const Matrix< T_, ma_, na_, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a 2x1 matrix.
CUDA_HOST_DEVICE void add_id(T_ &x, const T_ &alpha)
Adds a scaled identity onto a scalar.
Vector< T_, m_ > orthogonal(const Matrix< T_, m_, m_-1, sm_, sn_ > &tau)
Computes the positively oriented orthogonal vector to the columns of a m_ x (m_-1) Matrix.
CUDA_HOST_DEVICE Vector< T_, n_ > operator-(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector subtraction operator
CUDA_HOST_DEVICE Vector< T_, n_ > component_product(const Vector< T_, n_, sa_ > &a, const Vector< T_, n_, sb_ > &b)
vector element-wise-product operator
CUDA_HOST_DEVICE Vector< T_, n_ > operator*(typename Vector< T_, n_ >::DataType alpha, const Vector< T_, n_, s_ > &x)
scalar-left-multiplication operator
CUDA_HOST T_ calculate_opening_angle(const Vector< T_, 2 > &x, const Vector< T_, 2 > &y)
Calculates the counter-clockwise opening angle between two 2D vectors.
FEAT namespace.
Definition: adjactor.hpp:12
String stringify(const T_ &item)
Converts an item into a String.
Definition: string.hpp:993
@ value
specifies whether the space should supply basis function values