123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507 |
- #ifndef GL_MAT_HH
- #define GL_MAT_HH
- // This code implements a mathematical matrix, comparable in functionality
- // and syntax to the matrix types in GLSL.
- //
- // The code was tuned for 4x4 matrixes, but when it didn't complicate the
- // code, other sizes (even non-square) are supported as well.
- //
- // The functionality of this code is focused on openGL(ES). So only the
- // operation 'matrix x column-vector' is supported and not e.g. 'row-vector x
- // matrix'. The internal layout is also compatible with openGL(ES).
- //
- // This code itself doesn't have many SSE optimizations, but because it is built
- // on top of an optimized vector type the generated code for 4x4 matrix
- // multiplication is as efficient as a hand-written SSE assembly routine
- // (verified with gcc-4.8 on x86_64). The efficiency for other matrix sizes was
- // not a goal for this code (smaller sizes might be ok, bigger sizes most
- // likely not).
- #include "gl_vec.hh"
- #include <cassert>
- namespace gl {
- // Matrix with M rows and N columns (M by N), elements have type T.
- // Internally elements are stored in column-major order. This is the same
- // convention as openGL (and Fortran), but different from the typical row-major
- // order in C++. This allows to directly pass these matrices to openGL(ES).
- template<int M, int N, typename T> class matMxN
- {
- public:
- // Default copy-constructor and assignment operator.
- // Construct identity matrix.
- matMxN()
- {
- for (int i = 0; i < N; ++i) {
- for (int j = 0; j < M; ++j) {
- c[i][j] = (i == j) ? T(1) : T(0);
- }
- }
- }
-
- // Construct diagonal matrix.
- explicit matMxN(const vecN<(M < N ? M : N), T>& d)
- {
- for (int i = 0; i < N; ++i) {
- for (int j = 0; j < M; ++j) {
- c[i][j] = (i == j) ? d[i] : T(0);
- }
- }
- }
- // Construct from larger matrix (higher order elements are dropped).
- template<int M2, int N2> explicit matMxN(const matMxN<M2, N2, T>& x)
- {
- static_assert(((M2 > M) && (N2 >= N)) || ((M2 >= M) && (N2 > N)),
- "matrix must have strictly larger dimensions");
- for (int i = 0; i < N; ++i) c[i] = vecN<M, T>(x[i]);
- }
- // Construct matrix from 2 given columns (only valid when N == 2).
- matMxN(const vecN<M, T>& x, const vecN<M, T>& y)
- {
- static_assert(N == 2, "wrong #constructor arguments");
- c[0] = x; c[1] = y;
- }
- // Construct matrix from 3 given columns (only valid when N == 3).
- matMxN(const vecN<M, T>& x, const vecN<M, T>& y, const vecN<M, T>& z)
- {
- static_assert(N == 3, "wrong #constructor arguments");
- c[0] = x; c[1] = y; c[2] = z;
- }
- // Construct matrix from 4 given columns (only valid when N == 4).
- matMxN(const vecN<M, T>& x, const vecN<M, T>& y,
- const vecN<M, T>& z, const vecN<M, T>& w)
- {
- static_assert(N == 4, "wrong #constructor arguments");
- c[0] = x; c[1] = y; c[2] = z; c[3] = w;
- }
- // Access the i-the column of the matrix.
- // Vectors are also indexable, so 'A[i][j]' returns the element
- // at the i-th column, j-th row.
- const vecN<M, T>& operator[](unsigned i) const {
- #ifdef DEBUG
- assert(i < N);
- #endif
- return c[i];
- }
- vecN<M, T>& operator[](unsigned i) {
- #ifdef DEBUG
- assert(i < N);
- #endif
- return c[i];
- }
- // Assignment version of the +,-,* operations defined below.
- matMxN& operator+=(const matMxN& x) { *this = *this + x; return *this; }
- matMxN& operator-=(const matMxN& x) { *this = *this - x; return *this; }
- matMxN& operator*=(T x) { *this = *this * x; return *this; }
- matMxN& operator*=(const matMxN<N, N, T>& x) { *this = *this * x; return *this; }
- private:
- vecN<M, T> c[N];
- };
- // Specialization for 4x4 matrix.
- // This specialization is not needed for correctness, but it does help gcc-4.8
- // to generate better code for the constructors (clang-3.4 doesn't need help).
- // Maybe remove this specialization in the future?
- template<typename T> class matMxN<4, 4, T>
- {
- public:
- matMxN()
- {
- T one(1); T zero(0);
- c[0] = vecN<4,T>( one, zero, zero, zero);
- c[1] = vecN<4,T>(zero, one, zero, zero);
- c[2] = vecN<4,T>(zero, zero, one, zero);
- c[3] = vecN<4,T>(zero, zero, zero, one);
- }
- explicit matMxN(vecN<4, T> d)
- {
- T zero(0);
- c[0] = vecN<4,T>(d[0], zero, zero, zero);
- c[1] = vecN<4,T>(zero, d[1], zero, zero);
- c[2] = vecN<4,T>(zero, zero, d[2], zero);
- c[3] = vecN<4,T>(zero, zero, zero, d[3]);
- }
- matMxN(vecN<4, T> x, vecN<4, T> y, vecN<4, T> z, vecN<4, T> w)
- {
- c[0] = x; c[1] = y; c[2] = z; c[3] = w;
- }
- vecN<4, T> operator[](int i) const { return c[i]; }
- vecN<4, T>& operator[](int i) { return c[i]; }
- // Assignment version of the +,-,* operations defined below.
- matMxN& operator+=(const matMxN& x) { *this = *this + x; return *this; }
- matMxN& operator-=(const matMxN& x) { *this = *this - x; return *this; }
- matMxN& operator*=(T x) { *this = *this * x; return *this; }
- matMxN& operator*=(const matMxN& x) { *this = *this * x; return *this; }
- private:
- vecN<4, T> c[4];
- };
- // Convenience typedefs (same names as used by GLSL).
- using mat2 = matMxN<2, 2, float>;
- using mat3 = matMxN<3, 3, float>;
- using mat4 = matMxN<4, 4, float>;
- // -- Matrix functions --
- // matrix equality / inequality
- template<int M, int N, typename T>
- inline bool operator==(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
- {
- for (int i = 0; i < N; ++i) if (A[i] != B[i]) return false;
- return true;
- }
- template<int M, int N, typename T>
- inline bool operator!=(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
- {
- return !(A == B);
- }
- // matrix + matrix
- template<int M, int N, typename T>
- inline matMxN<M, N, T> operator+(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
- {
- matMxN<M, N, T> R;
- for (int i = 0; i < N; ++i) R[i] = A[i] + B[i];
- return R;
- }
- // matrix - matrix
- template<int M, int N, typename T>
- inline matMxN<M, N, T> operator-(const matMxN<M, N, T>& A, const matMxN<M, N, T>& B)
- {
- matMxN<M, N, T> R;
- for (int i = 0; i < N; ++i) R[i] = A[i] - B[i];
- return R;
- }
- // matrix negation
- template<int M, int N, typename T>
- inline matMxN<M, N, T> operator-(const matMxN<M, N, T>& A)
- {
- return matMxN<M, N, T>(vecN<(M < N ? M : N), T>()) - A;
- }
- // scalar * matrix
- template<int M, int N, typename T>
- inline matMxN<M, N, T> operator*(T x, const matMxN<M, N, T>& A)
- {
- matMxN<M, N, T> R;
- for (int i = 0; i < N; ++i) R[i] = x * A[i];
- return R;
- }
- // matrix * scalar
- template<int M, int N, typename T>
- inline matMxN<M, N, T> operator*(const matMxN<M, N, T>& A, T x)
- {
- matMxN<M, N, T> R;
- for (int i = 0; i < N; ++i) R[i] = A[i] * x;
- return R;
- }
- // matrix * column-vector
- template<int M, int N, typename T>
- inline vecN<M, T> operator*(const matMxN<M, N, T>& A, const vecN<N, T>& x)
- {
- vecN<M, T> r;
- for (int i = 0; i < N; ++i) r += A[i] * x[i];
- return r;
- }
- template<typename T>
- inline vecN<4, T> operator*(const matMxN<4, 4, T>& A, vecN<4, T> x)
- {
- // Small optimization for 4x4 matrix: reassociate add-chain
- return (A[0] * x[0] + A[1] * x[1]) + (A[2] * x[2] + A[3] * x[3]);
- }
- // matrix * matrix
- template<int M, int N, int O, typename T>
- inline matMxN<M, O, T> operator*(const matMxN<M, N, T>& A, const matMxN<N, O, T>& B)
- {
- matMxN<M, O, T> R;
- for (int i = 0; i < O; ++i) R[i] = A * B[i];
- return R;
- }
- // matrix transpose
- template<int M, int N, typename T>
- inline matMxN<N, M, T> transpose(const matMxN<M, N, T>& A)
- {
- matMxN<N, M, T> R;
- for (int i = 0; i < N; ++i) {
- for (int j = 0; j < M; ++j) {
- R[j][i] = A[i][j];
- }
- }
- return R;
- }
- // determinant of a 2x2 matrix
- template<typename T>
- inline T determinant(const matMxN<2, 2, T>& A)
- {
- return A[0][0] * A[1][1] - A[0][1] * A[1][0];
- }
- // determinant of a 3x3 matrix
- template<typename T>
- inline T determinant(const matMxN<3, 3, T>& A)
- {
- return A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
- - A[1][0] * (A[0][1] * A[2][2] - A[0][2] * A[2][1])
- + A[2][0] * (A[0][1] * A[1][2] - A[0][2] * A[1][1]);
- }
- // determinant of a 4x4 matrix
- template<typename T>
- inline T determinant(const matMxN<4, 4, T>& A)
- {
- // Implementation based on glm: http://glm.g-truc.net
- T f0 = A[2][2] * A[3][3] - A[3][2] * A[2][3];
- T f1 = A[2][1] * A[3][3] - A[3][1] * A[2][3];
- T f2 = A[2][1] * A[3][2] - A[3][1] * A[2][2];
- T f3 = A[2][0] * A[3][3] - A[3][0] * A[2][3];
- T f4 = A[2][0] * A[3][2] - A[3][0] * A[2][2];
- T f5 = A[2][0] * A[3][1] - A[3][0] * A[2][1];
- vecN<4, T> c((A[1][1] * f0 - A[1][2] * f1 + A[1][3] * f2),
- (A[1][2] * f3 - A[1][3] * f4 - A[1][0] * f0),
- (A[1][0] * f1 - A[1][1] * f3 + A[1][3] * f5),
- (A[1][1] * f4 - A[1][2] * f5 - A[1][0] * f2));
- return dot(A[0], c);
- }
- // inverse of a 2x2 matrix
- template<typename T>
- inline matMxN<2, 2, T> inverse(const matMxN<2, 2, T>& A)
- {
- T d = T(1) / determinant(A);
- return d * matMxN<2, 2, T>(vecN<2, T>( A[1][1], -A[0][1]),
- vecN<2, T>(-A[1][0], A[0][0]));
- }
- // inverse of a 3x3 matrix
- template<typename T>
- inline matMxN<3, 3, T> inverse(const matMxN<3, 3, T>& A)
- {
- T d = T(1) / determinant(A);
- return d * matMxN<3, 3, T>(
- vecN<3, T>(A[1][1] * A[2][2] - A[1][2] * A[2][1],
- A[0][2] * A[2][1] - A[0][1] * A[2][2],
- A[0][1] * A[1][2] - A[0][2] * A[1][1]),
- vecN<3, T>(A[1][2] * A[2][0] - A[1][0] * A[2][2],
- A[0][0] * A[2][2] - A[0][2] * A[2][0],
- A[0][2] * A[1][0] - A[0][0] * A[1][2]),
- vecN<3, T>(A[1][0] * A[2][1] - A[1][1] * A[2][0],
- A[0][1] * A[2][0] - A[0][0] * A[2][1],
- A[0][0] * A[1][1] - A[0][1] * A[1][0]));
- }
- // inverse of a 4x4 matrix
- template<typename T>
- inline matMxN<4, 4, T> inverse(const matMxN<4, 4, T>& A)
- {
- // Implementation based on glm: http://glm.g-truc.net
- T c00 = A[2][2] * A[3][3] - A[3][2] * A[2][3];
- T c02 = A[1][2] * A[3][3] - A[3][2] * A[1][3];
- T c03 = A[1][2] * A[2][3] - A[2][2] * A[1][3];
- T c04 = A[2][1] * A[3][3] - A[3][1] * A[2][3];
- T c06 = A[1][1] * A[3][3] - A[3][1] * A[1][3];
- T c07 = A[1][1] * A[2][3] - A[2][1] * A[1][3];
- T c08 = A[2][1] * A[3][2] - A[3][1] * A[2][2];
- T c10 = A[1][1] * A[3][2] - A[3][1] * A[1][2];
- T c11 = A[1][1] * A[2][2] - A[2][1] * A[1][2];
- T c12 = A[2][0] * A[3][3] - A[3][0] * A[2][3];
- T c14 = A[1][0] * A[3][3] - A[3][0] * A[1][3];
- T c15 = A[1][0] * A[2][3] - A[2][0] * A[1][3];
- T c16 = A[2][0] * A[3][2] - A[3][0] * A[2][2];
- T c18 = A[1][0] * A[3][2] - A[3][0] * A[1][2];
- T c19 = A[1][0] * A[2][2] - A[2][0] * A[1][2];
- T c20 = A[2][0] * A[3][1] - A[3][0] * A[2][1];
- T c22 = A[1][0] * A[3][1] - A[3][0] * A[1][1];
- T c23 = A[1][0] * A[2][1] - A[2][0] * A[1][1];
- vecN<4, T> f0(c00, c00, c02, c03);
- vecN<4, T> f1(c04, c04, c06, c07);
- vecN<4, T> f2(c08, c08, c10, c11);
- vecN<4, T> f3(c12, c12, c14, c15);
- vecN<4, T> f4(c16, c16, c18, c19);
- vecN<4, T> f5(c20, c20, c22, c23);
- vecN<4, T> v0(A[1][0], A[0][0], A[0][0], A[0][0]);
- vecN<4, T> v1(A[1][1], A[0][1], A[0][1], A[0][1]);
- vecN<4, T> v2(A[1][2], A[0][2], A[0][2], A[0][2]);
- vecN<4, T> v3(A[1][3], A[0][3], A[0][3], A[0][3]);
- vecN<4, T> i0(v1 * f0 - v2 * f1 + v3 * f2);
- vecN<4, T> i1(v0 * f0 - v2 * f3 + v3 * f4);
- vecN<4, T> i2(v0 * f1 - v1 * f3 + v3 * f5);
- vecN<4, T> i3(v0 * f2 - v1 * f4 + v2 * f5);
- vecN<4, T> sa(+1, -1, +1, -1);
- vecN<4, T> sb(-1, +1, -1, +1);
- matMxN<4, 4, T> inverse(i0 * sa, i1 * sb, i2 * sa, i3 * sb);
- vecN<4, T> row0(inverse[0][0], inverse[1][0], inverse[2][0], inverse[3][0]);
- T OneOverDeterminant = static_cast<T>(1) / dot(A[0], row0);
- return inverse * OneOverDeterminant;
- }
- // norm-2 squared
- template<int M, int N, typename T>
- inline T norm2_2(const matMxN<M, N, T>& A)
- {
- vecN<M, T> t;
- for (int i = 0; i < N; ++i) t += A[i] * A[i];
- return sum(t);
- }
- // Textual representation. (Only) used to debug unittest.
- template<int M, int N, typename T>
- std::ostream& operator<<(std::ostream& os, const matMxN<M, N, T>& A)
- {
- for (int j = 0; j < M; ++j) {
- for (int i = 0; i < N; ++i) {
- os << A[i][j] << ' ';
- }
- os << '\n';
- }
- return os;
- }
- } // namespace gl
- // --- SSE optimizations ---
- #ifdef __SSE__
- #include <xmmintrin.h>
- namespace gl {
- // transpose of 4x4-float matrix
- inline mat4 transpose(const mat4& A)
- {
- mat4 R;
- __m128 t0 = _mm_shuffle_ps(A[0].sse(), A[1].sse(), _MM_SHUFFLE(1,0,1,0));
- __m128 t1 = _mm_shuffle_ps(A[0].sse(), A[1].sse(), _MM_SHUFFLE(3,2,3,2));
- __m128 t2 = _mm_shuffle_ps(A[2].sse(), A[3].sse(), _MM_SHUFFLE(1,0,1,0));
- __m128 t3 = _mm_shuffle_ps(A[2].sse(), A[3].sse(), _MM_SHUFFLE(3,2,3,2));
- R[0] = vec4(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0)));
- R[1] = vec4(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,3,1)));
- R[2] = vec4(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(2,0,2,0)));
- R[3] = vec4(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3,1,3,1)));
- return R;
- }
- // inverse of a 4x4-float matrix
- inline mat4 inverse(const mat4& A)
- {
- // Implementation based on glm: http://glm.g-truc.net
- __m128 sa0 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 sb0 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 s00 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 s10 = _mm_shuffle_ps(sa0, sa0, _MM_SHUFFLE(2,0,0,0));
- __m128 s20 = _mm_shuffle_ps(sb0, sb0, _MM_SHUFFLE(2,0,0,0));
- __m128 s30 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 f0 = _mm_sub_ps(_mm_mul_ps(s00, s10), _mm_mul_ps(s20, s30));
- __m128 sa1 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 sb1 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 s01 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 s11 = _mm_shuffle_ps(sa1, sa1, _MM_SHUFFLE(2,0,0,0));
- __m128 s21 = _mm_shuffle_ps(sb1, sb1, _MM_SHUFFLE(2,0,0,0));
- __m128 s31 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 f1 = _mm_sub_ps(_mm_mul_ps(s01, s11), _mm_mul_ps(s21, s31));
- __m128 sa2 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 sb2 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 s02 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 s12 = _mm_shuffle_ps(sa2, sa2, _MM_SHUFFLE(2,0,0,0));
- __m128 s22 = _mm_shuffle_ps(sb2, sb2, _MM_SHUFFLE(2,0,0,0));
- __m128 s32 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 f2 = _mm_sub_ps(_mm_mul_ps(s02, s12), _mm_mul_ps(s22, s32));
- __m128 sa3 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 sb3 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s03 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s13 = _mm_shuffle_ps(sa3, sa3, _MM_SHUFFLE(2,0,0,0));
- __m128 s23 = _mm_shuffle_ps(sb3, sb3, _MM_SHUFFLE(2,0,0,0));
- __m128 s33 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 f3 = _mm_sub_ps(_mm_mul_ps(s03, s13), _mm_mul_ps(s23, s33));
- __m128 sa4 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 sb4 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s04 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s14 = _mm_shuffle_ps(sa4, sa4, _MM_SHUFFLE(2,0,0,0));
- __m128 s24 = _mm_shuffle_ps(sb4, sb4, _MM_SHUFFLE(2,0,0,0));
- __m128 s34 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 f4 = _mm_sub_ps(_mm_mul_ps(s04, s14), _mm_mul_ps(s24, s34));
- __m128 sa5 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 sb5 = _mm_shuffle_ps(A[3].sse(), A[2].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s05 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 s15 = _mm_shuffle_ps(sa5, sa5, _MM_SHUFFLE(2,0,0,0));
- __m128 s25 = _mm_shuffle_ps(sb5, sb5, _MM_SHUFFLE(2,0,0,0));
- __m128 s35 = _mm_shuffle_ps(A[2].sse(), A[1].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 f5 = _mm_sub_ps(_mm_mul_ps(s05, s15), _mm_mul_ps(s25, s35));
- __m128 t0 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(0,0,0,0));
- __m128 t1 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(1,1,1,1));
- __m128 t2 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(2,2,2,2));
- __m128 t3 = _mm_shuffle_ps(A[1].sse(), A[0].sse(), _MM_SHUFFLE(3,3,3,3));
- __m128 v0 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,2,2,0));
- __m128 v1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,2,2,0));
- __m128 v2 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,2,0));
- __m128 v3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,2,0));
- __m128 s0 = _mm_sub_ps(_mm_mul_ps(v1, f0), _mm_mul_ps(v2, f1));
- __m128 s1 = _mm_sub_ps(_mm_mul_ps(v0, f0), _mm_mul_ps(v2, f3));
- __m128 s2 = _mm_sub_ps(_mm_mul_ps(v0, f1), _mm_mul_ps(v1, f3));
- __m128 s3 = _mm_sub_ps(_mm_mul_ps(v0, f2), _mm_mul_ps(v1, f4));
- __m128 sa = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
- __m128 sb = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
- __m128 i0 = _mm_mul_ps(sb, _mm_add_ps(s0, _mm_mul_ps(v3, f2)));
- __m128 i1 = _mm_mul_ps(sa, _mm_add_ps(s1, _mm_mul_ps(v3, f4)));
- __m128 i2 = _mm_mul_ps(sb, _mm_add_ps(s2, _mm_mul_ps(v3, f5)));
- __m128 i3 = _mm_mul_ps(sa, _mm_add_ps(s3, _mm_mul_ps(v2, f5)));
- __m128 ra = _mm_shuffle_ps(i0, i1, _MM_SHUFFLE(0,0,0,0));
- __m128 rb = _mm_shuffle_ps(i2, i3, _MM_SHUFFLE(0,0,0,0));
- vec4 row0 = vec4(_mm_shuffle_ps(ra, rb, _MM_SHUFFLE(2,0,2,0)));
- vec4 rd = recip(dot_broadcast(A[0], row0));
- return mat4(vec4(i0) * rd, vec4(i1) * rd, vec4(i2) * rd, vec4(i3) * rd);
- }
- } // namespace gl
- #endif // __SSE__
- #endif
|