gl_mat.hh 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. #ifndef GL_MAT_HH
  2. #define GL_MAT_HH
  3. // This code implements a mathematical matrix, comparable in functionality
  4. // and syntax to the matrix types in GLSL.
  5. //
  6. // The code was tuned for 4x4 matrixes, but when it didn't complicate the
  7. // code, other sizes (even non-square) are supported as well.
  8. //
  9. // The functionality of this code is focused on openGL(ES). So only the
  10. // operation 'matrix x column-vector' is supported and not e.g. 'row-vector x
  11. // matrix'. The internal layout is also compatible with openGL(ES).
  12. //
  13. // This code itself doesn't have many SSE optimizations, but because it is built
  14. // on top of an optimized vector type the generated code for 4x4 matrix
  15. // multiplication is as efficient as a hand-written SSE assembly routine
  16. // (verified with gcc-4.8 on x86_64). The efficiency for other matrix sizes was
  17. // not a goal for this code (smaller sizes might be ok, bigger sizes most
  18. // likely not).
  19. #include "gl_vec.hh"
  20. #include <cassert>
  21. namespace gl {
  22. // Matrix with M rows and N columns (M by N), elements have type T.
  23. // Internally elements are stored in column-major order. This is the same
  24. // convention as openGL (and Fortran), but different from the typical row-major
  25. // order in C++. This allows to directly pass these matrices to openGL(ES).
  26. template<int M, int N, typename T> class matMxN
  27. {
  28. public:
  29. // Default copy-constructor and assignment operator.
  30. // Construct identity matrix.
  31. matMxN()
  32. {
  33. for (int i = 0; i < N; ++i) {
  34. for (int j = 0; j < M; ++j) {
  35. c[i][j] = (i == j) ? T(1) : T(0);
  36. }
  37. }
  38. }
  39. // Construct diagonal matrix.
  40. explicit matMxN(const vecN<(M < N ? M : N), T>& d)
  41. {
  42. for (int i = 0; i < N; ++i) {
  43. for (int j = 0; j < M; ++j) {
  44. c[i][j] = (i == j) ? d[i] : T(0);
  45. }
  46. }
  47. }
  48. // Construct from larger matrix (higher order elements are dropped).
  49. template<int M2, int N2> explicit matMxN(const matMxN<M2, N2, T>& x)
  50. {
  51. static_assert(((M2 > M) && (N2 >= N)) || ((M2 >= M) && (N2 > N)),
  52. "matrix must have strictly larger dimensions");
  53. for (int i = 0; i < N; ++i) c[i] = vecN<M, T>(x[i]);
  54. }
  55. // Construct matrix from 2 given columns (only valid when N == 2).
  56. matMxN(const vecN<M, T>& x, const vecN<M, T>& y)
  57. {
  58. static_assert(N == 2, "wrong #constructor arguments");
  59. c[0] = x; c[1] = y;
  60. }
  61. // Construct matrix from 3 given columns (only valid when N == 3).
  62. matMxN(const vecN<M, T>& x, const vecN<M, T>& y, const vecN<M, T>& z)
  63. {
  64. static_assert(N == 3, "wrong #constructor arguments");
  65. c[0] = x; c[1] = y; c[2] = z;
  66. }
  67. // Construct matrix from 4 given columns (only valid when N == 4).
  68. matMxN(const vecN<M, T>& x, const vecN<M, T>& y,
  69. const vecN<M, T>& z, const vecN<M, T>& w)
  70. {
  71. static_assert(N == 4, "wrong #constructor arguments");
  72. c[0] = x; c[1] = y; c[2] = z; c[3] = w;
  73. }
  74. // Access the i-the column of the matrix.
  75. // Vectors are also indexable, so 'A[i][j]' returns the element
  76. // at the i-th column, j-th row.
  77. const vecN<M, T>& operator[](unsigned i) const {
  78. #ifdef DEBUG
  79. assert(i < N);
  80. #endif
  81. return c[i];
  82. }
  83. vecN<M, T>& operator[](unsigned i) {
  84. #ifdef DEBUG
  85. assert(i < N);
  86. #endif
  87. return c[i];
  88. }
  89. // Assignment version of the +,-,* operations defined below.
  90. matMxN& operator+=(const matMxN& x) { *this = *this + x; return *this; }
  91. matMxN& operator-=(const matMxN& x) { *this = *this - x; return *this; }
  92. matMxN& operator*=(T x) { *this = *this * x; return *this; }
  93. matMxN& operator*=(const matMxN<N, N, T>& x) { *this = *this * x; return *this; }
  94. private:
  95. vecN<M, T> c[N];
  96. };
  97. // Specialization for 4x4 matrix.
  98. // This specialization is not needed for correctness, but it does help gcc-4.8
  99. // to generate better code for the constructors (clang-3.4 doesn't need help).
  100. // Maybe remove this specialization in the future?
  101. template<typename T> class matMxN<4, 4, T>
  102. {
  103. public:
  104. matMxN()
  105. {
  106. T one(1); T zero(0);
  107. c[0] = vecN<4,T>( one, zero, zero, zero);
  108. c[1] = vecN<4,T>(zero, one, zero, zero);
  109. c[2] = vecN<4,T>(zero, zero, one, zero);
  110. c[3] = vecN<4,T>(zero, zero, zero, one);
  111. }
  112. explicit matMxN(vecN<4, T> d)
  113. {
  114. T zero(0);
  115. c[0] = vecN<4,T>(d[0], zero, zero, zero);
  116. c[1] = vecN<4,T>(zero, d[1], zero, zero);
  117. c[2] = vecN<4,T>(zero, zero, d[2], zero);
  118. c[3] = vecN<4,T>(zero, zero, zero, d[3]);
  119. }
  120. matMxN(vecN<4, T> x, vecN<4, T> y, vecN<4, T> z, vecN<4, T> w)
  121. {
  122. c[0] = x; c[1] = y; c[2] = z; c[3] = w;
  123. }
  124. vecN<4, T> operator[](int i) const { return c[i]; }
  125. vecN<4, T>& operator[](int i) { return c[i]; }
  126. // Assignment version of the +,-,* operations defined below.
  127. matMxN& operator+=(const matMxN& x) { *this = *this + x; return *this; }
  128. matMxN& operator-=(const matMxN& x) { *this = *this - x; return *this; }
  129. matMxN& operator*=(T x) { *this = *this * x; return *this; }
  130. matMxN& operator*=(const matMxN& x) { *this = *this * x; return *this; }
  131. private:
  132. vecN<4, T> c[4];
  133. };
  134. // Convenience typedefs (same names as used by GLSL).
  135. using mat2 = matMxN<2, 2, float>;
  136. using mat3 = matMxN<3, 3, float>;
  137. using mat4 = matMxN<4, 4, float>;
  138. // -- Matrix functions --
  139. // matrix equality / inequality
  140. template<int M, int N, typename T>
  141. inline bool operator==(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
  142. {
  143. for (int i = 0; i < N; ++i) if (A[i] != B[i]) return false;
  144. return true;
  145. }
  146. template<int M, int N, typename T>
  147. inline bool operator!=(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
  148. {
  149. return !(A == B);
  150. }
  151. // matrix + matrix
  152. template<int M, int N, typename T>
  153. inline matMxN<M, N, T> operator+(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
  154. {
  155. matMxN<M, N, T> R;
  156. for (int i = 0; i < N; ++i) R[i] = A[i] + B[i];
  157. return R;
  158. }
  159. // matrix - matrix
  160. template<int M, int N, typename T>
  161. inline matMxN<M, N, T> operator-(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
  162. {
  163. matMxN<M, N, T> R;
  164. for (int i = 0; i < N; ++i) R[i] = A[i] - B[i];
  165. return R;
  166. }
  167. // matrix negation
  168. template<int M, int N, typename T>
  169. inline matMxN<M, N, T> operator-(const matMxN<M, N, T>& A)
  170. {
  171. return matMxN<M, N, T>(vecN<(M < N ? M : N), T>()) - A;
  172. }
  173. // scalar * matrix
  174. template<int M, int N, typename T>
  175. inline matMxN<M, N, T> operator*(T x, const matMxN<M, N, T>& A)
  176. {
  177. matMxN<M, N, T> R;
  178. for (int i = 0; i < N; ++i) R[i] = x * A[i];
  179. return R;
  180. }
  181. // matrix * scalar
  182. template<int M, int N, typename T>
  183. inline matMxN<M, N, T> operator*(const matMxN<M, N, T>& A, T x)
  184. {
  185. matMxN<M, N, T> R;
  186. for (int i = 0; i < N; ++i) R[i] = A[i] * x;
  187. return R;
  188. }
  189. // matrix * column-vector
  190. template<int M, int N, typename T>
  191. inline vecN<M, T> operator*(const matMxN<M, N, T>& A, const vecN<N, T>& x)
  192. {
  193. vecN<M, T> r;
  194. for (int i = 0; i < N; ++i) r += A[i] * x[i];
  195. return r;
  196. }
  197. template<typename T>
  198. inline vecN<4, T> operator*(const matMxN<4, 4, T>& A, vecN<4, T> x)
  199. {
  200. // Small optimization for 4x4 matrix: reassociate add-chain
  201. return (A[0] * x[0] + A[1] * x[1]) + (A[2] * x[2] + A[3] * x[3]);
  202. }
  203. // matrix * matrix
  204. template<int M, int N, int O, typename T>
  205. inline matMxN<M, O, T> operator*(const matMxN<M, N, T>& A, const matMxN<N, O, T>& B)
  206. {
  207. matMxN<M, O, T> R;
  208. for (int i = 0; i < O; ++i) R[i] = A * B[i];
  209. return R;
  210. }
  211. // matrix transpose
  212. template<int M, int N, typename T>
  213. inline matMxN<N, M, T> transpose(const matMxN<M, N, T>& A)
  214. {
  215. matMxN<N, M, T> R;
  216. for (int i = 0; i < N; ++i) {
  217. for (int j = 0; j < M; ++j) {
  218. R[j][i] = A[i][j];
  219. }
  220. }
  221. return R;
  222. }
  223. // determinant of a 2x2 matrix
  224. template<typename T>
  225. inline T determinant(const matMxN<2, 2, T>& A)
  226. {
  227. return A[0][0] * A[1][1] - A[0][1] * A[1][0];
  228. }
  229. // determinant of a 3x3 matrix
  230. template<typename T>
  231. inline T determinant(const matMxN<3, 3, T>& A)
  232. {
  233. return A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
  234. - A[1][0] * (A[0][1] * A[2][2] - A[0][2] * A[2][1])
  235. + A[2][0] * (A[0][1] * A[1][2] - A[0][2] * A[1][1]);
  236. }
  237. // determinant of a 4x4 matrix
  238. template<typename T>
  239. inline T determinant(const matMxN<4, 4, T>& A)
  240. {
  241. // Implementation based on glm: http://glm.g-truc.net
  242. T f0 = A[2][2] * A[3][3] - A[3][2] * A[2][3];
  243. T f1 = A[2][1] * A[3][3] - A[3][1] * A[2][3];
  244. T f2 = A[2][1] * A[3][2] - A[3][1] * A[2][2];
  245. T f3 = A[2][0] * A[3][3] - A[3][0] * A[2][3];
  246. T f4 = A[2][0] * A[3][2] - A[3][0] * A[2][2];
  247. T f5 = A[2][0] * A[3][1] - A[3][0] * A[2][1];
  248. vecN<4, T> c((A[1][1] * f0 - A[1][2] * f1 + A[1][3] * f2),
  249. (A[1][2] * f3 - A[1][3] * f4 - A[1][0] * f0),
  250. (A[1][0] * f1 - A[1][1] * f3 + A[1][3] * f5),
  251. (A[1][1] * f4 - A[1][2] * f5 - A[1][0] * f2));
  252. return dot(A[0], c);
  253. }
  254. // inverse of a 2x2 matrix
  255. template<typename T>
  256. inline matMxN<2, 2, T> inverse(const matMxN<2, 2, T>& A)
  257. {
  258. T d = T(1) / determinant(A);
  259. return d * matMxN<2, 2, T>(vecN<2, T>( A[1][1], -A[0][1]),
  260. vecN<2, T>(-A[1][0], A[0][0]));
  261. }
  262. // inverse of a 3x3 matrix
  263. template<typename T>
  264. inline matMxN<3, 3, T> inverse(const matMxN<3, 3, T>& A)
  265. {
  266. T d = T(1) / determinant(A);
  267. return d * matMxN<3, 3, T>(
  268. vecN<3, T>(A[1][1] * A[2][2] - A[1][2] * A[2][1],
  269. A[0][2] * A[2][1] - A[0][1] * A[2][2],
  270. A[0][1] * A[1][2] - A[0][2] * A[1][1]),
  271. vecN<3, T>(A[1][2] * A[2][0] - A[1][0] * A[2][2],
  272. A[0][0] * A[2][2] - A[0][2] * A[2][0],
  273. A[0][2] * A[1][0] - A[0][0] * A[1][2]),
  274. vecN<3, T>(A[1][0] * A[2][1] - A[1][1] * A[2][0],
  275. A[0][1] * A[2][0] - A[0][0] * A[2][1],
  276. A[0][0] * A[1][1] - A[0][1] * A[1][0]));
  277. }
  278. // inverse of a 4x4 matrix
  279. template<typename T>
  280. inline matMxN<4, 4, T> inverse(const matMxN<4, 4, T>& A)
  281. {
  282. // Implementation based on glm: http://glm.g-truc.net
  283. T c00 = A[2][2] * A[3][3] - A[3][2] * A[2][3];
  284. T c02 = A[1][2] * A[3][3] - A[3][2] * A[1][3];
  285. T c03 = A[1][2] * A[2][3] - A[2][2] * A[1][3];
  286. T c04 = A[2][1] * A[3][3] - A[3][1] * A[2][3];
  287. T c06 = A[1][1] * A[3][3] - A[3][1] * A[1][3];
  288. T c07 = A[1][1] * A[2][3] - A[2][1] * A[1][3];
  289. T c08 = A[2][1] * A[3][2] - A[3][1] * A[2][2];
  290. T c10 = A[1][1] * A[3][2] - A[3][1] * A[1][2];
  291. T c11 = A[1][1] * A[2][2] - A[2][1] * A[1][2];
  292. T c12 = A[2][0] * A[3][3] - A[3][0] * A[2][3];
  293. T c14 = A[1][0] * A[3][3] - A[3][0] * A[1][3];
  294. T c15 = A[1][0] * A[2][3] - A[2][0] * A[1][3];
  295. T c16 = A[2][0] * A[3][2] - A[3][0] * A[2][2];
  296. T c18 = A[1][0] * A[3][2] - A[3][0] * A[1][2];
  297. T c19 = A[1][0] * A[2][2] - A[2][0] * A[1][2];
  298. T c20 = A[2][0] * A[3][1] - A[3][0] * A[2][1];
  299. T c22 = A[1][0] * A[3][1] - A[3][0] * A[1][1];
  300. T c23 = A[1][0] * A[2][1] - A[2][0] * A[1][1];
  301. vecN<4, T> f0(c00, c00, c02, c03);
  302. vecN<4, T> f1(c04, c04, c06, c07);
  303. vecN<4, T> f2(c08, c08, c10, c11);
  304. vecN<4, T> f3(c12, c12, c14, c15);
  305. vecN<4, T> f4(c16, c16, c18, c19);
  306. vecN<4, T> f5(c20, c20, c22, c23);
  307. vecN<4, T> v0(A[1][0], A[0][0], A[0][0], A[0][0]);
  308. vecN<4, T> v1(A[1][1], A[0][1], A[0][1], A[0][1]);
  309. vecN<4, T> v2(A[1][2], A[0][2], A[0][2], A[0][2]);
  310. vecN<4, T> v3(A[1][3], A[0][3], A[0][3], A[0][3]);
  311. vecN<4, T> i0(v1 * f0 - v2 * f1 + v3 * f2);
  312. vecN<4, T> i1(v0 * f0 - v2 * f3 + v3 * f4);
  313. vecN<4, T> i2(v0 * f1 - v1 * f3 + v3 * f5);
  314. vecN<4, T> i3(v0 * f2 - v1 * f4 + v2 * f5);
  315. vecN<4, T> sa(+1, -1, +1, -1);
  316. vecN<4, T> sb(-1, +1, -1, +1);
  317. matMxN<4, 4, T> inverse(i0 * sa, i1 * sb, i2 * sa, i3 * sb);
  318. vecN<4, T> row0(inverse[0][0], inverse[1][0], inverse[2][0], inverse[3][0]);
  319. T OneOverDeterminant = static_cast<T>(1) / dot(A[0], row0);
  320. return inverse * OneOverDeterminant;
  321. }
  322. // norm-2 squared
  323. template<int M, int N, typename T>
  324. inline T norm2_2(const matMxN<M, N, T>& A)
  325. {
  326. vecN<M, T> t;
  327. for (int i = 0; i < N; ++i) t += A[i] * A[i];
  328. return sum(t);
  329. }
  330. // Textual representation. (Only) used to debug unittest.
  331. template<int M, int N, typename T>
  332. std::ostream& operator<<(std::ostream& os, const matMxN<M, N, T>& A)
  333. {
  334. for (int j = 0; j < M; ++j) {
  335. for (int i = 0; i < N; ++i) {
  336. os << A[i][j] << ' ';
  337. }
  338. os << '\n';
  339. }
  340. return os;
  341. }
  342. } // namespace gl
  343. // --- SSE optimizations ---
  344. #ifdef __SSE__
  345. #include <xmmintrin.h>
  346. namespace gl {
  347. // transpose of 4x4-float matrix
  348. inline mat4 transpose(const mat4& A)
  349. {
  350. mat4 R;
  351. __m128 t0 = _mm_shuffle_ps(A[0].sse(), A[1].sse(), _MM_SHUFFLE(1,0,1,0));
  352. __m128 t1 = _mm_shuffle_ps(A[0].sse(), A[1].sse(), _MM_SHUFFLE(3,2,3,2));
  353. __m128 t2 = _mm_shuffle_ps(A[2].sse(), A[3].sse(), _MM_SHUFFLE(1,0,1,0));
  354. __m128 t3 = _mm_shuffle_ps(A[2].sse(), A[3].sse(), _MM_SHUFFLE(3,2,3,2));
  355. R[0] = vec4(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)));
  356. R[1] = vec4(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,3,1)));
  357. R[2] = vec4(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(2,0,2,0)));
  358. R[3] = vec4(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3,1,3,1)));
  359. return R;
  360. }
  361. // inverse of a 4x4-float matrix
  362. inline mat4 inverse(const mat4& A)
  363. {
  364. // Implementation based on glm: http://glm.g-truc.net
  365. __m128 sa0 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
  366. __m128 sb0 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
  367. __m128 s00 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
  368. __m128 s10 = _mm_shuffle_ps(sa0, sa0, _MM_SHUFFLE(2,0,0,0));
  369. __m128 s20 = _mm_shuffle_ps(sb0, sb0, _MM_SHUFFLE(2,0,0,0));
  370. __m128 s30 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
  371. __m128 f0 = _mm_sub_ps(_mm_mul_ps(s00, s10), _mm_mul_ps(s20, s30));
  372. __m128 sa1 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
  373. __m128 sb1 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
  374. __m128 s01 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
  375. __m128 s11 = _mm_shuffle_ps(sa1, sa1, _MM_SHUFFLE(2,0,0,0));
  376. __m128 s21 = _mm_shuffle_ps(sb1, sb1, _MM_SHUFFLE(2,0,0,0));
  377. __m128 s31 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
  378. __m128 f1 = _mm_sub_ps(_mm_mul_ps(s01, s11), _mm_mul_ps(s21, s31));
  379. __m128 sa2 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
  380. __m128 sb2 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
  381. __m128 s02 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
  382. __m128 s12 = _mm_shuffle_ps(sa2, sa2, _MM_SHUFFLE(2,0,0,0));
  383. __m128 s22 = _mm_shuffle_ps(sb2, sb2, _MM_SHUFFLE(2,0,0,0));
  384. __m128 s32 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
  385. __m128 f2 = _mm_sub_ps(_mm_mul_ps(s02, s12), _mm_mul_ps(s22, s32));
  386. __m128 sa3 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
  387. __m128 sb3 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
  388. __m128 s03 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
  389. __m128 s13 = _mm_shuffle_ps(sa3, sa3, _MM_SHUFFLE(2,0,0,0));
  390. __m128 s23 = _mm_shuffle_ps(sb3, sb3, _MM_SHUFFLE(2,0,0,0));
  391. __m128 s33 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
  392. __m128 f3 = _mm_sub_ps(_mm_mul_ps(s03, s13), _mm_mul_ps(s23, s33));
  393. __m128 sa4 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
  394. __m128 sb4 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
  395. __m128 s04 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
  396. __m128 s14 = _mm_shuffle_ps(sa4, sa4, _MM_SHUFFLE(2,0,0,0));
  397. __m128 s24 = _mm_shuffle_ps(sb4, sb4, _MM_SHUFFLE(2,0,0,0));
  398. __m128 s34 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
  399. __m128 f4 = _mm_sub_ps(_mm_mul_ps(s04, s14), _mm_mul_ps(s24, s34));
  400. __m128 sa5 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
  401. __m128 sb5 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
  402. __m128 s05 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
  403. __m128 s15 = _mm_shuffle_ps(sa5, sa5, _MM_SHUFFLE(2,0,0,0));
  404. __m128 s25 = _mm_shuffle_ps(sb5, sb5, _MM_SHUFFLE(2,0,0,0));
  405. __m128 s35 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
  406. __m128 f5 = _mm_sub_ps(_mm_mul_ps(s05, s15), _mm_mul_ps(s25, s35));
  407. __m128 t0 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(0,0,0,0));
  408. __m128 t1 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(1,1,1,1));
  409. __m128 t2 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(2,2,2,2));
  410. __m128 t3 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(3,3,3,3));
  411. __m128 v0 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,2,0));
  412. __m128 v1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,2,0));
  413. __m128 v2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,2,0));
  414. __m128 v3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,2,0));
  415. __m128 s0 = _mm_sub_ps(_mm_mul_ps(v1, f0), _mm_mul_ps(v2, f1));
  416. __m128 s1 = _mm_sub_ps(_mm_mul_ps(v0, f0), _mm_mul_ps(v2, f3));
  417. __m128 s2 = _mm_sub_ps(_mm_mul_ps(v0, f1), _mm_mul_ps(v1, f3));
  418. __m128 s3 = _mm_sub_ps(_mm_mul_ps(v0, f2), _mm_mul_ps(v1, f4));
  419. __m128 sa = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
  420. __m128 sb = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
  421. __m128 i0 = _mm_mul_ps(sb, _mm_add_ps(s0, _mm_mul_ps(v3, f2)));
  422. __m128 i1 = _mm_mul_ps(sa, _mm_add_ps(s1, _mm_mul_ps(v3, f4)));
  423. __m128 i2 = _mm_mul_ps(sb, _mm_add_ps(s2, _mm_mul_ps(v3, f5)));
  424. __m128 i3 = _mm_mul_ps(sa, _mm_add_ps(s3, _mm_mul_ps(v2, f5)));
  425. __m128 ra = _mm_shuffle_ps(i0, i1, _MM_SHUFFLE(0,0,0,0));
  426. __m128 rb = _mm_shuffle_ps(i2, i3, _MM_SHUFFLE(0,0,0,0));
  427. vec4 row0 = vec4(_mm_shuffle_ps(ra, rb, _MM_SHUFFLE(2,0,2,0)));
  428. vec4 rd = recip(dot_broadcast(A[0], row0));
  429. return mat4(vec4(i0) * rd, vec4(i1) * rd, vec4(i2) * rd, vec4(i3) * rd);
  430. }
  431. } // namespace gl
  432. #endif // __SSE__
  433. #endif