// Fills A with the n x n Fox-Li matrix for the parameter omega.
//
// The kernel sqrt(i*omega/pi) * exp(-i*omega*(x_r - x_c)^2) is sampled at the
// Gauss quadrature nodes x, and the result is scaled from the left and from
// the right by the square roots of the quadrature weights.
//
//   A     - output matrix, resized to n x n
//   n     - number of quadrature nodes (matrix dimension)
//   omega - kernel parameter
void FoxLi( Matrix<Complex<Real>>& A, Int n, Real omega )
{
    DEBUG_CSE
    using Cpx = Complex<Real>;
    const Real pi = 4*Atan( Real(1) );
    const Cpx phi = Sqrt( Cpx(0,omega/pi) );

    // Quadrature nodes and weights come from the eigen-decomposition of a
    // symmetric tridiagonal matrix with zero diagonal.
    Matrix<Real> diag, offDiag;
    Zeros( diag, n, 1 );
    offDiag.Resize( n-1, 1 );
    for( Int k=0; k<n-1; ++k )
    {
        const Real betaInv = 2*Sqrt(1-Pow(k+Real(1),-2)/4);
        offDiag(k) = 1/betaInv;
    }
    Matrix<Real> nodes, eigVecs;
    HermitianTridiagEig( diag, offDiag, nodes, eigVecs, UNSORTED );

    // sqrt(weight_k) = sqrt(2) * |first component of eigenvector k|
    auto firstRow = eigVecs( IR(0), ALL );
    Matrix<Real> rootWeights( firstRow ), rootWeightsCol;
    const Real rootTwo = Sqrt(Real(2));
    for( Int k=0; k<n; ++k )
        rootWeights(0,k) = rootTwo*Abs(rootWeights(0,k));
    herm_eig::Sort( nodes, rootWeights, ASCENDING );
    Transpose( rootWeights, rootWeightsCol );

    // Sample the kernel at every pair of nodes.
    A.Resize( n, n );
    for( Int col=0; col<n; ++col )
    {
        for( Int row=0; row<n; ++row )
        {
            const Real theta = -omega*Pow(nodes(row)-nodes(col),2);
            A(row,col) = phi*Cpx(Cos(theta),Sin(theta));
        }
    }

    // Symmetric weighting: A := diag(sqrt(w)) A diag(sqrt(w))
    DiagonalScale( LEFT, NORMAL, rootWeightsCol, A );
    DiagonalScale( RIGHT, NORMAL, rootWeightsCol, A );
}
// Rotation kring godtycklig axel (enbart rotationen) void ArbRotate(Point3D *axis, GLfloat fi, GLfloat *m) { Point3D x, y, z, a; GLfloat R[16], Rt[16], Raxel[16], RtRx[16]; // Kolla ocksŒ om parallell med Z-axel! if (axis->x < 0.0000001) // Under nŒgon tillrŠckligt liten grŠns if (axis->x > -0.0000001) if (axis->y < 0.0000001) if (axis->y > -0.0000001) if (axis->z > 0) { Rz(fi, m); return; } else { Rz(-fi, m); return; } x = *axis; Normalize(&x); // |x| SetVector(0,0,1, &z); // Temp z CrossProduct(&z, &x, &y); Normalize(&y); // y' = z^ x x' CrossProduct(&x, &y, &z); // z' = x x y R[0] = x.x; R[4] = x.y; R[8] = x.z; R[12] = 0.0; R[1] = y.x; R[5] = y.y; R[9] = y.z; R[13] = 0.0; R[2] = z.x; R[6] = z.y; R[10] = z.z; R[14] = 0.0; R[3] = 0.0; R[7] = 0.0; R[11] = 0.0; R[15] = 1.0; Transpose(&R, &Rt); // Transpose = Invert -> felet ej i Transpose, och det Šr en ortonormal matris Rx(fi, &Raxel); // Rotate around x axis // m := Rt * Rx * R Mult(&Rt, &Raxel, &RtRx); Mult(&RtRx, &R, m); }
void SUMMA_NTA ( Orientation orientB, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { EL_DEBUG_CSE const Int n = CPre.Width(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); const bool conjugate = ( orientB == ADJOINT ); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,MR,STAR> B1Trans_MR_STAR(g); DistMatrix<T,MC,STAR> D1_MC_STAR(g); B1Trans_MR_STAR.AlignWith( A ); D1_MC_STAR.AlignWith( A ); for( Int k=0; k<n; k+=bsize ) { const Int nb = Min(bsize,n-k); auto B1 = B( IR(k,k+nb), ALL ); auto C1 = C( ALL, IR(k,k+nb) ); // C1[MC,*] := alpha A[MC,MR] (B1^[T/H])[MR,*] Transpose( B1, B1Trans_MR_STAR, conjugate ); LocalGemm( NORMAL, NORMAL, alpha, A, B1Trans_MR_STAR, D1_MC_STAR ); // C1[MC,MR] += scattered result of D1[MC,*] summed over grid rows AxpyContract( T(1), D1_MC_STAR, C1 ); } }
// AnimatedTransform Method Definitions void AnimatedTransform::Decompose(const Matrix4x4 &m, Vector *T, Quaternion *Rquat, Matrix4x4 *S) { // Extract translation _T_ from transformation matrix T->x = m.m[0][3]; T->y = m.m[1][3]; T->z = m.m[2][3]; // Compute new transformation matrix _M_ without translation Matrix4x4 M = m; for (int i = 0; i < 3; ++i) M.m[i][3] = M.m[3][i] = 0.f; M.m[3][3] = 1.f; // Extract rotation _R_ from transformation matrix float norm; int count = 0; Matrix4x4 R = M; do { // Compute next matrix _Rnext_ in series Matrix4x4 Rnext; Matrix4x4 Rit = Inverse(Transpose(R)); for (int i = 0; i < 4; ++i) for (int j = 0; j < 4; ++j) Rnext.m[i][j] = 0.5f * (R.m[i][j] + Rit.m[i][j]); // Compute norm of difference between _R_ and _Rnext_ norm = 0.f; for (int i = 0; i < 3; ++i) { float n = fabsf(R.m[i][0] - Rnext.m[i][0]) + fabsf(R.m[i][1] - Rnext.m[i][1]) + fabsf(R.m[i][2] - Rnext.m[i][2]); norm = max(norm, n); } R = Rnext; } while (++count < 100 && norm > .0001f); // XXX TODO FIXME deal with flip... *Rquat = Quaternion(R); // Compute scale _S_ using rotation and original matrix *S = Matrix4x4::Mul(Inverse(R), M); }
int CalcSphereCenter (const Point<3> ** pts, Point<3> & c) { Vec3d row1 (*pts[0], *pts[1]); Vec3d row2 (*pts[0], *pts[2]); Vec3d row3 (*pts[0], *pts[3]); Vec3d rhs(0.5 * (row1*row1), 0.5 * (row2*row2), 0.5 * (row3*row3)); Transpose (row1, row2, row3); Vec3d sol; if (SolveLinearSystem (row1, row2, row3, rhs, sol)) { (*testout) << "CalcSphereCenter: degenerated" << endl; return 1; } c = *pts[0] + sol; return 0; }
// Returns the inverse of this 3x3 matrix, or the all-zero matrix when the
// determinant is exactly zero.
//
// Row i of the inverse is the cross product of the other two columns of this
// matrix (taken in cyclic order) divided by the determinant. The columns are
// obtained as the rows of the transpose.
Matrix Matrix::Inverse() const
{
    // Evaluate the determinant once; it is needed both for the singularity
    // test and for the scaling.
    const Float det = Determinant();
    if (det == 0)
    {
        return Matrix(
            Vector3D(0, 0, 0),
            Vector3D(0, 0, 0),
            Vector3D(0, 0, 0));
    }

    const Float invDet = 1.0/det;
    Matrix matrix = Transpose();
    return Matrix(
        matrix.Rows[1].CrossProduct(matrix.Rows[2]) * invDet,
        matrix.Rows[2].CrossProduct(matrix.Rows[0]) * invDet,
        matrix.Rows[0].CrossProduct(matrix.Rows[1]) * invDet);
}
// Computes the inverse and the determinant of a symmetric matrix through its
// eigen-decomposition (LAPACK dspev on packed upper-triangular storage).
//
//   theMatrix    - input matrix; only the upper triangle (j >= i) is read
//   theInvMatrix - receives Z * diag(1/w) * Z^T, with Z the eigenvectors and
//                  w the eigenvalues
//   theDet       - receives the product of the eigenvalues
//
// Throws cOTError when dspev reports a non-zero info code.
// NOTE(review): a zero eigenvalue is not rejected; 1.0/0 is stored as in the
// original code.
void LapackInvAndDet(cDMatrix& theMatrix, cDMatrix& theInvMatrix, double& theDet)
{
	uint myNCol = theMatrix.GetNCols() ;
	int myInfo = 0,
	    myN = (int)(myNCol),
	    myldz = (int)(myNCol) ;

	// Work buffers. They start out null so that the cleanup path is valid no
	// matter which allocation or later step throws.
	double *myAP = 0,
	       *myW = 0,
	       *myZ = 0,
	       *myWork = 0 ;

	try
	{
		myAP = new double[myNCol*(myNCol + 1)/2] ;
		myW = new double[myNCol] ;
		myZ = new double[myNCol*myNCol] ;
		myWork = new double[myNCol * 3] ;

		// Pack the upper triangle column by column: A(i,j) -> AP[i + j*(j+1)/2]
		for (int i = 0 ; i < myN ; i++)
			for (int j = i ; j < myldz ; j++)
				myAP[i+(j+1)*j/2] = theMatrix[i][j] ;

		// "V": eigenvalues and eigenvectors, "U": upper triangle is stored
		F77_NAME(dspev)("V", "U", &myN, myAP, myW, myZ, &myldz, myWork, &myInfo) ;
		if (myInfo != 0)
			throw cOTError("Non inversible matrix") ;

		theDet = 1.0L ;
		cDVector myInvEigenValue = cDVector(myNCol) ;
		cDMatrix myEigenVector(myNCol, myNCol) ;
		for (uint i = 0 ; i < myNCol ; i++)
		{
			theDet *= myW[i] ;
			myInvEigenValue[i] = 1.0 /myW[i] ;
			// myZ is column-major: element (i,j) lives at i + j*myN
			for (int j = 0 ; j < myN ; j++)
				myEigenVector[i][j] = myZ[i + j*myN] ;
		}

		// inverse = Z * (diag(1/w) * Z^T)
		theInvMatrix = myEigenVector ;
		cDMatrix myAuxMat1 = Diag(myInvEigenValue),
		         myAuxMat2 = Transpose(myEigenVector) ;
		cDMatrix myAuxMat = myAuxMat1 * myAuxMat2 ;
		theInvMatrix = theInvMatrix * myAuxMat ;
	}
	catch (...)
	{
		// Release the buffers before propagating the error.
		delete[] myAP ;
		delete[] myW ;
		delete[] myZ ;
		delete[] myWork ;
		throw ;
	}

	// Arrays allocated with new[] must be released with delete[].
	delete[] myAP ;
	delete[] myW ;
	delete[] myZ ;
	delete[] myWork ;
}
void Camera::Precompute() { //********************************************************* //Compute m_mKRt //********************************************************* KRt_ = K_ * Transpose(R_); //********************************************************* //Compute KRtT //********************************************************* KRtT_ = KRt_ * t_; //********************************************************* //Compute VPN //********************************************************* VPN_ = R_.getColumn(2); //********************************************************* //Compute VPd //********************************************************* VPd_ = 0.0; for(int i = 0; i < t_.getSize(); i++) { VPd_ += t_(i)*(VPN_(i)*(-1.0)); } //********************************************************* //Compute GPN and GPD //********************************************************* GPD_ = GP_(3); GPN_.setSize(3); for(int i = 0; i < GPN_.getSize(); i++) { GPN_(i) = GP_(i); } }
void SUMMA_TNC ( Orientation orientA, T alpha, const AbstractDistMatrix<T>& APre, const AbstractDistMatrix<T>& BPre, AbstractDistMatrix<T>& CPre ) { DEBUG_CSE const Int sumDim = BPre.Height(); const Int bsize = Blocksize(); const Grid& g = APre.Grid(); DistMatrixReadProxy<T,T,MC,MR> AProx( APre ); DistMatrixReadProxy<T,T,MC,MR> BProx( BPre ); DistMatrixReadWriteProxy<T,T,MC,MR> CProx( CPre ); auto& A = AProx.GetLocked(); auto& B = BProx.GetLocked(); auto& C = CProx.Get(); // Temporary distributions DistMatrix<T,STAR,MC> A1_STAR_MC(g); DistMatrix<T,MR,STAR> B1Trans_MR_STAR(g); A1_STAR_MC.AlignWith( C ); B1Trans_MR_STAR.AlignWith( C ); for( Int k=0; k<sumDim; k+=bsize ) { const Int nb = Min(bsize,sumDim-k); auto A1 = A( IR(k,k+nb), ALL ); auto B1 = B( IR(k,k+nb), ALL ); // C[MC,MR] += alpha (A1[*,MC])^T B1[*,MR] // = alpha (A1^T)[MC,*] B1[*,MR] A1_STAR_MC = A1; Transpose( B1, B1Trans_MR_STAR ); LocalGemm ( orientA, TRANSPOSE, alpha, A1_STAR_MC, B1Trans_MR_STAR, T(1), C ); } }
// Re-expresses the plane returned by camera.get_GP() using the transposed
// camera rotation.
//
// The first three plane components are rotated by R^T; the fourth component
// becomes plane(3) plus the dot product of the rotated components with R^T * t.
// Returns the resulting 4-vector.
Vector<double> AncillaryMethods::PlaneToCam(const Camera& camera)
{
    Vector<double> plane = camera.get_GP();
    Matrix<double> rot_transposed = Transpose(camera.get_R());

    // Rotate the first three plane components.
    Vector<double> normal(plane(0), plane(1), plane(2));
    normal = rot_transposed * normal;

    // Rotate the camera translation the same way.
    Vector<double> t = rot_transposed * camera.get_t();

    double d = plane(3) + normal(0)*t(0) + normal(1)*t(1) + normal(2)*t(2);

    Vector<double> gp_in_camera(4);
    for (int k = 0; k < 3; ++k)
    {
        gp_in_camera(k) = normal(k);
    }
    gp_in_camera(3) = d;

    return gp_in_camera;
}
int main() { double Av[9] = {2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, }; Matrix<double> A(3, 3, Av); Matrix<double> B = 0.5 * Transpose(A) * A; printf("Matrix:\n"); B.Print(); char outputFilename[96] = "BMatrix"; B.Save(outputFilename); Matrix<double> Q(3,3); // will hold eigenvectors Matrix<double> Lambda(3,1); // will hold eigenvalues B.SymmetricEigenDecomposition(Q, Lambda); printf("Eigenvalues:\n"); Lambda.Print(); return 0; }
void AugmentedKKT ( const Matrix<Real>& A, const Matrix<Real>& x, const Matrix<Real>& z, Matrix<Real>& J, bool onlyLower ) { EL_DEBUG_CSE const Int m = A.Height(); const Int n = A.Width(); Zeros( J, m+n, m+n ); const IR xInd(0,n), yInd(n,n+m); auto Jxx = J(xInd,xInd); auto Jxy = J(xInd,yInd); auto Jyx = J(yInd,xInd); auto Jyy = J(yInd,yInd); Matrix<Real> d( z ); DiagonalSolve( LEFT, NORMAL, x, d ); Diagonal( Jxx, d ); Jyx = A; if( !onlyLower ) Transpose( A, Jxy ); }
PhaseEnumerationCache ( const Matrix<Field>& B, const Matrix<Base<Field>>& d, const Matrix<Field>& N, const Matrix<Base<Field>>& normUpperBounds, Int batchSize=256, Int blocksize=32, bool useTranspose=true ) : B_(B), d_(d), N_(N), normUpperBounds_(normUpperBounds), foundVector_(false), numQueued_(0), insertionBound_(normUpperBounds.Height()), blocksize_(blocksize), useTranspose_(useTranspose) { Zeros( Y_, N.Height(), batchSize ); if( useTranspose ) Transpose( N, NTrans_ ); }
void SBVAR_symmetric::SetupSBVAR_symmetric(void) { prior_YY=TransposeMultiply(prior_Y,prior_Y); prior_XX=TransposeMultiply(prior_X,prior_X); prior_XY=TransposeMultiply(prior_X,prior_Y); if (flat_prior) log_prior_constant=0.0; else { TDenseMatrix S(n_vars+n_predetermined,n_vars+n_predetermined); S.Insert(0,0,prior_YY); S.Insert(n_vars,0,-prior_XY); S.Insert(0,n_vars,-Transpose(prior_XY)); S.Insert(n_vars,n_vars,prior_XX); log_prior_constant=n_vars*(-0.918938533204673*(n_vars+n_predetermined) + 0.5*LogAbsDeterminant(S)); // 0.918938533204673 = 0.5*ln(2*pi) } prior_YY*=lambda_bar; prior_XX*=lambda_bar; prior_XY*=lambda_bar; log_prior_constant*=lambda_bar; }
// Fallback used when the function code is NOF or not one of the known cases.
// Tries, in order: the speed test (looked up through fc->name), an existing
// variable, an existing matrix. Otherwise it reports the call as not
// implemented and bumps the fni counter.
static void CallFallback(FunctionCall fc)
{
	if (GetFunction(fc->name) == SPT)
	{
		SpeedTest(fc);
		return;
	}
	if (IndexVariable(fc->function) != -1)
	{
		NewVariable(fc);
		return;
	}
	if (IndexMatrix(fc->function) != -1)
	{
		NewMatrix(fc);
		return;
	}
	printf("\t%s : Function Not Implemented\n", fc->function);
	fni++;
}

// Dispatches a parsed call to the handler selected by GetFunction().
// The fni counter is reset to zero after every call whose code is neither
// NOF nor VAR.
void CallFunction(FunctionCall fc)
{
	const int code = GetFunction(fc->function);

	switch (code)
	{
		case VAR: NewVariable(fc);    break;
		case NMX: NewMatrix(fc);      break;
		case ADD: Addition(fc);       break;
		case SUB: Substraction(fc);   break;
		case MUL: Multiplication(fc); break;
		case MSC: Scalar_Mult(fc);    break;
		case EXP: Exponentiation(fc); break;
		case TRA: Transpose(fc);      break;
		case DET: Determinant(fc);    break;
		case DLU: Decomposition(fc);  break;
		case SOL: Solve(fc);          break;
		case INV: Inversion(fc);      break;
		case RNK: Rank(fc);           break;
		case DSP: Display(fc);        break;
		case NOF:
		default:
			CallFallback(fc);
			break;
	}

	if (!(code == NOF || code == VAR))
		fni = 0;
}
//----------------------------------------------------------------------------- // Update // Updates the object // TODO: Pre- and Post- updates? //----------------------------------------------------------------------------- void CView::Update( void ) { RVector4 x, y, z; z = m_vLook = Normalize( m_vLook ); x = m_vRight = Normalize( CrossProduct( m_vUp, z ) ); y = CrossProduct( z, x ); m_mView.r0 = x; m_mView.r1 = y; m_mView.r2 = z; m_mView.r3 = RVector4Zero(); m_mView = Transpose( m_mView ); m_mView.r3 = RVector4( -DotProduct( x, m_vPosition), -DotProduct( y, m_vPosition), -DotProduct( z, m_vPosition), 1.0f ); //z = Normalize( RQuatGetZAxis(m_Transform.orientation) ); //x = Normalize( CrossProduct( RVector3(0.0f,1.0f,0.0f), z ) ); //y = CrossProduct( z, x ); // //m_mView.r0 = Homogonize( x ); //m_mView.r1 = Homogonize( y ); //m_mView.r2 = Homogonize( z ); //m_mView.r3 = RVector4Zero(); //m_mView = Transpose( m_mView ); // //m_mView.r3 = RVector4( -DotProduct( x, m_Transform.position), -DotProduct( y, m_Transform.position), -DotProduct( z, m_Transform.position), 1.0f ); char szCameraData[256] = { 0 }; sprintf( szCameraData, "Pos: %f, %f, %f", m_vPosition.x, m_vPosition.y, m_vPosition.z ); Engine::GetRenderer()->DrawString( 200, 16, szCameraData ); sprintf( szCameraData, "Look: %f, %f, %f", m_vLook.x, m_vLook.y, m_vLook.z ); Engine::GetRenderer()->DrawString( 200, 32, szCameraData ); }
// Returns the inverse of A, or its pseudo inverse when A is not square.
//
// Matrices with fewer than two rows or columns are rejected through
// ThrowMatrixMath (). Square 3x3 input goes to Invert3by3, other square sizes
// to InvertNbyN. Non-square input yields Invert (A^T * A) * A^T.
dng_matrix Invert (const dng_matrix &A)
	{

	if (A.Rows () < 2 || A.Cols () < 2)
		{
		ThrowMatrixMath ();
		}

	if (A.Rows () != A.Cols ())
		{

		// Pseudo inverse: (A^T A)^-1 A^T

		dng_matrix At = Transpose (A);

		return Invert (At * A) * At;

		}

	if (A.Rows () == 3)
		{
		return Invert3by3 (A);
		}

	return InvertNbyN (A);

	}
/* See Waggoner and Zha, "A Gibbs sampler for structural vector autoregressions", JEDC 2003, for discription of notations. We take the square root of a symmetric and positive definite X to be any matrix Y such that Y*Y'=X. Note that this is not the usual definition because we do not require Y to be symmetric and positive definite. */ void SBVAR_symmetric_linear::SetSimulationInfo(void) { if (NumberObservations() == 0) throw dw_exception("SetSimulationInfo(): cannot simulate if no observations"); TDenseMatrix all_YY, all_XY, all_XX; if (flat_prior) { all_YY=YY; all_XY=XY; all_XX=XX; } else { TDenseMatrix all_Y, all_X; all_Y=VCat(sqrt(lambda)*Data(),sqrt(lambda_bar)*prior_Y); all_X=VCat(sqrt(lambda)*PredeterminedData(),sqrt(lambda_bar)*prior_X); all_YY=Transpose(all_Y)*all_Y; all_XY=Transpose(all_X)*all_Y; all_XX=Transpose(all_X)*all_X; } Simulate_SqrtH.resize(n_vars); Simulate_P.resize(n_vars); Simulate_SqrtS.resize(n_vars); Simulate_USqrtS.resize(n_vars); for (int i=n_vars-1; i >= 0; i--) { TDenseMatrix invH=Transpose(V[i])*(all_XX*V[i]); Simulate_SqrtH[i]=Inverse(Cholesky(invH,CHOLESKY_UPPER_TRIANGULAR),SOLVE_UPPER_TRIANGULAR); Simulate_P[i]=Simulate_SqrtH[i]*(Transpose(Simulate_SqrtH[i])*(Transpose(V[i])*(all_XY*U[i]))); Simulate_SqrtS[i]=sqrt(lambda_T)*Inverse(Cholesky(Transpose(U[i])*(all_YY*U[i]) - Transpose(Simulate_P[i])*(invH*Simulate_P[i]),CHOLESKY_UPPER_TRIANGULAR),SOLVE_UPPER_TRIANGULAR); Simulate_USqrtS[i]=U[i]*Simulate_SqrtS[i]; } simulation_info_set=true; }
/*
=================
R_SubdividePatchToGrid

Builds a grid mesh from a width x height array of patch control points.

The control points are copied into a working grid. Columns are then
subdivided adaptively, first along the width and, after transposing the grid,
along the original height. Finally the points are placed on the curve,
rows/columns flagged as colinear are removed, the grid is optionally flipped
so that width >= height, normals are generated and the surface is created.

width, height - dimensions of the control point array
points        - control points, read as points[row * width + column]

Returns the result of R_CreateSurfaceGridMesh for the final grid.
=================
*/
srfGridMesh_t *R_SubdividePatchToGrid(int width, int height, drawVert_t points[MAX_PATCH_SIZE * MAX_PATCH_SIZE]) {
	int i, j, k, l;
	drawVert_t prev, next, mid;
	float len, maxLen;
	int dir;
	int t;
	MAC_STATIC drawVert_t ctrl[MAX_GRID_SIZE][MAX_GRID_SIZE];
	// errorTable[0] is indexed by column and errorTable[1] by row of the final
	// (un-transposed) grid; the value 999 marks a line that can be culled
	float errorTable[2][MAX_GRID_SIZE];

	// copy the flat input array into the 2D working grid (ctrl[row][column])
	for (i = 0 ; i < width ; i++) {
		for (j = 0 ; j < height ; j++) {
			ctrl[j][i] = points[j * width + i];
		}
	}

	// two passes: the grid is transposed at the end of each pass, so the same
	// column-wise code handles both directions and the grid ends up in its
	// original orientation
	for (dir = 0 ; dir < 2 ; dir++) {

		for (j = 0 ; j < MAX_GRID_SIZE ; j++) {
			errorTable[dir][j] = 0;
		}

		// horizontal subdivisions
		for (j = 0 ; j + 2 < width ; j += 2) {
			// check subdivided midpoints against control points

			// FIXME: also check midpoints of adjacent patches against the control points
			// this would basically stitch all patches in the same LOD group together.

			maxLen = 0;
			for (i = 0 ; i < height ; i++) {
				vec3_t midxyz;
				vec3_t dir;	// note: shadows the outer int 'dir' inside this loop body
				vec3_t projected;
				float d;

				// calculate the point on the curve
				for (l = 0 ; l < 3 ; l++) {
					midxyz[l] = (ctrl[i][j].xyz[l] + ctrl[i][j + 1].xyz[l] * 2
					             + ctrl[i][j + 2].xyz[l]) * 0.25f;
				}

				// see how far off the line it is
				// using dist-from-line will not account for internal
				// texture warping, but it gives a lot less polygons than
				// dist-from-midpoint
				VectorSubtract(midxyz, ctrl[i][j].xyz, midxyz);
				VectorSubtract(ctrl[i][j + 2].xyz, ctrl[i][j].xyz, dir);
				VectorNormalize(dir);

				// remove the component along the chord; what is left is the
				// offset of the curve midpoint from the chord
				d = DotProduct(midxyz, dir);
				VectorScale(dir, d, projected);
				VectorSubtract(midxyz, projected, midxyz);
				len = VectorLengthSquared(midxyz);            // we will do the sqrt later
				if (len > maxLen) {
					maxLen = len;
				}
			}

			maxLen = sqrt(maxLen);

			// if all the points are on the lines, remove the entire columns
			if (maxLen < 0.1f) {
				errorTable[dir][j + 1] = 999;
				continue;
			}

			// see if we want to insert subdivided columns
			if (width + 2 > MAX_GRID_SIZE) {
				errorTable[dir][j + 1] = 1.0f / maxLen;
				continue;	// can't subdivide any more
			}

			if (maxLen <= r_subdivisions->value) {
				errorTable[dir][j + 1] = 1.0f / maxLen;
				continue;	// didn't need subdivision
			}

			errorTable[dir][j + 2] = 1.0f / maxLen;

			// insert two columns and replace the peak
			width += 2;
			for (i = 0 ; i < height ; i++) {
				LerpDrawVert(&ctrl[i][j], &ctrl[i][j + 1], &prev);
				LerpDrawVert(&ctrl[i][j + 1], &ctrl[i][j + 2], &next);
				LerpDrawVert(&prev, &next, &mid);

				// shift the tail of the row two places to the right to make room
				for (k = width - 1 ; k > j + 3 ; k--) {
					ctrl[i][k] = ctrl[i][k - 2];
				}
				ctrl[i][j + 1] = prev;
				ctrl[i][j + 2] = mid;
				ctrl[i][j + 3] = next;
			}

			// back up and recheck this set again, it may need more subdivision
			// (the loop increment of 2 cancels this, so the same j is revisited)
			j -= 2;

		}

		Transpose(width, height, ctrl);
		t = width;
		width = height;
		height = t;
	}

	// put all the approximating points on the curve
	PutPointsOnCurve(ctrl, width, height);

	// cull out any rows or columns that are colinear
	for (i = 1 ; i < width - 1 ; i++) {
		if (errorTable[0][i] != 999) {
			continue;
		}
		// close the gap left by column i, in the grid and in the error table
		for (j = i + 1 ; j < width ; j++) {
			for (k = 0 ; k < height ; k++) {
				ctrl[k][j - 1] = ctrl[k][j];
			}
			errorTable[0][j - 1] = errorTable[0][j];
		}
		width--;
	}

	for (i = 1 ; i < height - 1 ; i++) {
		if (errorTable[1][i] != 999) {
			continue;
		}
		// close the gap left by row i, in the grid and in the error table
		for (j = i + 1 ; j < height ; j++) {
			for (k = 0 ; k < width ; k++) {
				ctrl[j - 1][k] = ctrl[j][k];
			}
			errorTable[1][j - 1] = errorTable[1][j];
		}
		height--;
	}

	// flip for longest tristrips as an optimization
	// the results should be visually identical with or
	// without this step
	if (height > width) {
		Transpose(width, height, ctrl);
		InvertErrorTable(errorTable, width, height);
		t = width;
		width = height;
		height = t;
		InvertCtrl(width, height, ctrl);
	}

	// calculate normals
	MakeMeshNormals(width, height, ctrl);

	return R_CreateSurfaceGridMesh(width, height, ctrl, errorTable);
}
// Older variant of the distributed triangular matrix-matrix product
// X := alpha op(L) X, with L applied from the left as a lower-triangular
// matrix and op being the transpose or the adjoint (chosen by 'orientation').
//
// X is first scaled by alpha. L and X are then traversed from top to bottom;
// each iteration updates the block row X1 as
//     X1 := op(L11) X1 + op(L21) X2.
//
// In non-RELEASE builds the grids, the orientation and the dimensions are
// validated and std::logic_error is thrown on a mismatch.
// NOTE(review): those throws happen after PushCallStack without a matching
// PopCallStack - confirm that this is handled elsewhere.
inline void TrmmLLTCOld
( Orientation orientation, UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& L, DistMatrix<T>& X )
{
#ifndef RELEASE
    PushCallStack("internal::TrmmLLTCOld");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( orientation == NORMAL )
        throw std::logic_error("TrmmLLT expects a (Conjugate)Transpose option");
    if( L.Height() != L.Width() || L.Height() != X.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmLLTC: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<T>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);
    DistMatrix<T> XT(g),  X0(g),
                  XB(g),  X1(g),
                          X2(g);

    // Temporary distributions
    DistMatrix<T,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<T,MC,  STAR> L21_MC_STAR(g);
    DistMatrix<T,STAR,VR  > X1_STAR_VR(g);
    DistMatrix<T,MR,  STAR> D1AdjOrTrans_MR_STAR(g);
    DistMatrix<T,MR,  MC  > D1AdjOrTrans_MR_MC(g);
    DistMatrix<T,MC,  MR  > D1(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionDown
    ( X, XT,
         XB, 0 );
    while( XB.Height() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        RepartitionDown
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );

        L21_MC_STAR.AlignWith( X2 );
        D1AdjOrTrans_MR_STAR.AlignWith( X1 );
        D1AdjOrTrans_MR_MC.AlignWith( X1 );
        D1.AlignWith( X1 );
        Zeros( X1.Width(), X1.Height(), D1AdjOrTrans_MR_STAR );
        Zeros( X1.Height(), X1.Width(), D1 );
        //--------------------------------------------------------------------//
        // Diagonal block: X1 := op(L11) X1, computed on redistributed copies
        X1_STAR_VR = X1;
        L11_STAR_STAR = L11;
        LocalTrmm
        ( LEFT, LOWER, orientation, diag, T(1), L11_STAR_STAR, X1_STAR_VR );
        X1 = X1_STAR_VR;

        // Off-diagonal block: form op(X2) L21 locally, sum the partial
        // results, and flip them back to obtain D1 = op(L21) X2
        L21_MC_STAR = L21;
        LocalGemm
        ( orientation, NORMAL,
          T(1), X2, L21_MC_STAR, T(0), D1AdjOrTrans_MR_STAR );
        D1AdjOrTrans_MR_MC.SumScatterFrom( D1AdjOrTrans_MR_STAR );
        if( orientation == TRANSPOSE )
            Transpose( D1AdjOrTrans_MR_MC.LocalMatrix(), D1.LocalMatrix() );
        else
            Adjoint( D1AdjOrTrans_MR_MC.LocalMatrix(), D1.LocalMatrix() );
        Axpy( T(1), D1, X1 );
        //--------------------------------------------------------------------//
        D1.FreeAlignments();
        D1AdjOrTrans_MR_MC.FreeAlignments();
        D1AdjOrTrans_MR_STAR.FreeAlignments();
        L21_MC_STAR.FreeAlignments();

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, L01, /**/ L02,
               /**/       L10, L11, /**/ L12,
         /*************/ /******************/
          LBL, /**/ LBR,  L20, L21, /**/ L22 );

        SlidePartitionDown
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
int main(int argc, char* argv[]) { // Various examples. int i; long tick, base = 0; long ticks[SAMPLES]; char* testname; // We are looking for the best value among SAMPLES to // eliminate cache delays and effects of cpuid variable timing. testname = "base"; START_MEASUREMENTS; END_MEASUREMENTS; base = Duration(ticks); // time required to count processor clocks // ================ // SMLXMatrix Tests // ================ SMLXMatrix m1(3, 3); // declare initial size SMLXMatrix m2(3, 3); SMLXMatrix m3; // size will be set by '=' SMLXSpatialVector v1(11, 22, 33); SMLXSpatialVector v2; testname = "3x3 * 3x1"; m1.Set(2.0); m2.Set(3.0); m1[1][2] = m1[2][1] = 3; START_MEASUREMENTS; v2 = m1 * v1; END_MEASUREMENTS; // m1.Output("m1"); // v1.Output("v1"); // v2.Output("m1 * v1"); testname = "3x3 * 3x3"; START_MEASUREMENTS; m3 = m1 * m2; END_MEASUREMENTS; // m1.Output("m1"); // m2.Output("m2"); // m3.Output("m1 * m2"); testname = "6x6 * 6x6"; m1.Resize(6, 6); m2.Resize(6, 6); m1.Set(1); m2.Set(2); m1[0][5] = 10; START_MEASUREMENTS; m3 = m1 * m2; END_MEASUREMENTS; // m1.Output("m1"); // m2.Output("m2"); // m3.Output("m1 * m2"); testname = "6x6 * Transpose(6x6)"; START_MEASUREMENTS; m3 = m1 * Transpose(m2); END_MEASUREMENTS; testname = "6x6 + 6x6"; START_MEASUREMENTS; m3 = m1 + m3; END_MEASUREMENTS; testname = "6x6 * 6x6 - Transpose(6x6) * 6x6 - 6x6"; START_MEASUREMENTS; m3 = m1 * m2 - Transpose(m3) * m1 - m2; END_MEASUREMENTS; // Assuming non-zero diagonal... testname = "Invert Without Pivoting(6x6)"; m1.Identity(); m1[0][0] = 10; m1[3][4] = 2; m3 = m1; START_MEASUREMENTS; m1.Invert(); END_MEASUREMENTS; // m3 = m3 - m1; // discard even number of inversions // m3.Output("m3"); // General case. 
testname = "Invert With Pivoting(6x6)"; START_MEASUREMENTS; m1.GenericInvert(); END_MEASUREMENTS; // ================= // SMLMatrix3f tests // ================= // TransformPoint and Multiply are inlined for SMLMatrix3f, // so timing is not entirely correct // (some subexpressions are optimized out of loop)... testname = "TransformPoint 3x3"; m1.Resize(3, 3); SMLMatrix3f m33_1 = (const SMLMatrix3f&) m1; SMLMatrix3f m33_2(m33_1); SMLMatrix3f m33_3; SMLVec3f v3_1(11, 22, 33); SMLVec3f v3_2; m33_2.Set(1, 2, 3.0); START_MEASUREMENTS; m33_1.TransformPoint(v3_1, v3_2); END_MEASUREMENTS; // m33_1.Output("m1"); // report("m1 * v1 = {%f, %f, %f}", v3_2.x, v3_2.y, v3_2.z); testname = "Multiply 3x3"; START_MEASUREMENTS; m33_3.Multiply(m33_1, m33_2); END_MEASUREMENTS; // ================= // SMLMatrix4f tests // ================= testname = "Transform 4x4"; m1.Resize(4, 4); SMLMatrix4f m44_1 = (const SMLMatrix4f&) m1; SMLMatrix4f m44_2(m44_1); SMLMatrix4f m44_3; SMLVec4f v4_1(11, 22, 33, 44); SMLVec4f v4_2; m44_2.Set(1, 2, 3.0); START_MEASUREMENTS; m44_1.Transform(v4_1, v4_2); END_MEASUREMENTS; // m44_1.Output("m1"); // report("m1 * v1 = {%f, %f, %f, %f}", v4_2.x, v4_2.y, v4_2.z, v4_2.w); testname = "TransformPoint 4x4"; START_MEASUREMENTS; m44_1.TransformPoint(v3_1, v3_2); END_MEASUREMENTS; testname = "TransformVector 4x4"; START_MEASUREMENTS; m44_1.TransformPoint(v3_1, v3_2); END_MEASUREMENTS; testname = "Multiply 4x4"; START_MEASUREMENTS; m44_3.Multiply(m44_1, m44_2); END_MEASUREMENTS; return 0; }
// Distributed symmetric matrix-vector update: y is scaled by beta and the
// contribution accumulated by the internal::LocalSymv*Accumulate* helpers
// from alpha, A and x is then added to it.
// NOTE(review): this is presumably y := alpha A x + beta y using only the
// 'uplo' triangle of A, with 'conjugate' selecting the Hermitian case -
// confirm against the helper implementations.
//
// x and y may each be stored as a column (width 1) or as a row vector; one of
// four branches is chosen accordingly. Each branch keeps two partial results
// (one per distribution of the work), sums them across the process grid, and
// adds the total to y.
//
// In non-RELEASE builds the grids and dimensions are validated and
// std::logic_error is thrown on a mismatch.
inline void Symv
( UpperOrLower uplo,
  T alpha, const DistMatrix<T>& A,
           const DistMatrix<T>& x,
  T beta,        DistMatrix<T>& y,
  bool conjugate=false )
{
#ifndef RELEASE
    CallStackEntry entry("Symv");
    if( A.Grid() != x.Grid() || x.Grid() != y.Grid() )
        throw std::logic_error
        ("{A,x,y} must be distributed over the same grid");
    if( A.Height() != A.Width() )
        throw std::logic_error("A must be square");
    if( ( x.Width() != 1 && x.Height() != 1 ) ||
        ( y.Width() != 1 && y.Height() != 1 ) )
        throw std::logic_error("x and y are assumed to be vectors");
    const int xLength = ( x.Width()==1 ? x.Height() : x.Width() );
    const int yLength = ( y.Width()==1 ? y.Height() : y.Width() );
    if( A.Height() != xLength || A.Height() != yLength )
    {
        std::ostringstream msg;
        msg << "Nonconformal Symv: \n"
            << " A ~ " << A.Height() << " x " << A.Width() << "\n"
            << " x ~ " << x.Height() << " x " << x.Width() << "\n"
            << " y ~ " << y.Height() << " x " << y.Width() << "\n";
        throw std::logic_error( msg.str() );
    }
#endif
    const Grid& g = A.Grid();

    if( x.Width() == 1 && y.Width() == 1 )
    {
        // Case 1: x and y are both column vectors

        // Temporary distributions
        DistMatrix<T,MC,STAR> x_MC_STAR(g), z_MC_STAR(g);
        DistMatrix<T,MR,STAR> x_MR_STAR(g), z_MR_STAR(g);
        DistMatrix<T,MR,MC  > z_MR_MC(g);
        DistMatrix<T> z(g);

        // Begin the algorithm
        Scale( beta, y );
        x_MC_STAR.AlignWith( A );
        x_MR_STAR.AlignWith( A );
        z_MC_STAR.AlignWith( A );
        z_MR_STAR.AlignWith( A );
        z.AlignWith( y );
        Zeros( z_MC_STAR, y.Height(), 1 );
        Zeros( z_MR_STAR, y.Height(), 1 );
        //--------------------------------------------------------------------//
        x_MC_STAR = x;
        x_MR_STAR = x_MC_STAR;
        if( uplo == LOWER )
        {
            internal::LocalSymvColAccumulateL
            ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate );
        }
        else
        {
            internal::LocalSymvColAccumulateU
            ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate );
        }

        // Combine both partial results into z, then y += z
        z_MR_MC.SumScatterFrom( z_MR_STAR );
        z = z_MR_MC;
        z.SumScatterUpdate( T(1), z_MC_STAR );
        Axpy( T(1), z, y );
        //--------------------------------------------------------------------//
        x_MC_STAR.FreeAlignments();
        x_MR_STAR.FreeAlignments();
        z_MC_STAR.FreeAlignments();
        z_MR_STAR.FreeAlignments();
        z.FreeAlignments();
    }
    else if( x.Width() == 1 )
    {
        // Case 2: x is a column vector, y is a row vector

        // Temporary distributions
        DistMatrix<T,MC,STAR> x_MC_STAR(g), z_MC_STAR(g);
        DistMatrix<T,MR,STAR> x_MR_STAR(g), z_MR_STAR(g);
        DistMatrix<T,MR,MC  > z_MR_MC(g);
        DistMatrix<T> z(g), zTrans(g);

        // Begin the algorithm
        Scale( beta, y );
        x_MC_STAR.AlignWith( A );
        x_MR_STAR.AlignWith( A );
        z_MC_STAR.AlignWith( A );
        z_MR_STAR.AlignWith( A );
        z.AlignWith( y );
        z_MR_MC.AlignWith( y );
        Zeros( z_MC_STAR, y.Width(), 1 );
        Zeros( z_MR_STAR, y.Width(), 1 );
        //--------------------------------------------------------------------//
        x_MC_STAR = x;
        x_MR_STAR = x_MC_STAR;
        if( uplo == LOWER )
        {
            internal::LocalSymvColAccumulateL
            ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate );
        }
        else
        {
            internal::LocalSymvColAccumulateU
            ( alpha, A, x_MC_STAR, x_MR_STAR, z_MC_STAR, z_MR_STAR, conjugate );
        }

        // The combined result is a column; transpose it to match the row y
        z.SumScatterFrom( z_MC_STAR );
        z_MR_MC = z;
        z_MR_MC.SumScatterUpdate( T(1), z_MR_STAR );
        Transpose( z_MR_MC, zTrans );
        Axpy( T(1), zTrans, y );
        //--------------------------------------------------------------------//
        x_MC_STAR.FreeAlignments();
        x_MR_STAR.FreeAlignments();
        z_MC_STAR.FreeAlignments();
        z_MR_STAR.FreeAlignments();
        z.FreeAlignments();
        z_MR_MC.FreeAlignments();
    }
    else if( y.Width() == 1 )
    {
        // Case 3: x is a row vector, y is a column vector

        // Temporary distributions
        DistMatrix<T,STAR,MC> x_STAR_MC(g), z_STAR_MC(g);
        DistMatrix<T,STAR,MR> x_STAR_MR(g), z_STAR_MR(g);
        DistMatrix<T,MR,  MC> z_MR_MC(g);
        DistMatrix<T> z(g), zTrans(g);

        // Begin the algorithm
        Scale( beta, y );
        x_STAR_MC.AlignWith( A );
        x_STAR_MR.AlignWith( A );
        z_STAR_MC.AlignWith( A );
        z_STAR_MR.AlignWith( A );
        z.AlignWith( y );
        z_MR_MC.AlignWith( y );
        Zeros( z_STAR_MC, 1, y.Height() );
        Zeros( z_STAR_MR, 1, y.Height() );
        //--------------------------------------------------------------------//
        x_STAR_MR = x;
        x_STAR_MC = x_STAR_MR;
        if( uplo == LOWER )
        {
            internal::LocalSymvRowAccumulateL
            ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate );
        }
        else
        {
            internal::LocalSymvRowAccumulateU
            ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate );
        }

        // The combined result is a row; transpose it to match the column y
        z.SumScatterFrom( z_STAR_MR );
        z_MR_MC = z;
        z_MR_MC.SumScatterUpdate( T(1), z_STAR_MC );
        Transpose( z_MR_MC, zTrans );
        Axpy( T(1), zTrans, y );
        //--------------------------------------------------------------------//
        x_STAR_MC.FreeAlignments();
        x_STAR_MR.FreeAlignments();
        z_STAR_MC.FreeAlignments();
        z_STAR_MR.FreeAlignments();
        z.FreeAlignments();
        z_MR_MC.FreeAlignments();
    }
    else
    {
        // Case 4: x and y are both row vectors

        // Temporary distributions
        DistMatrix<T,STAR,MC> x_STAR_MC(g), z_STAR_MC(g);
        DistMatrix<T,STAR,MR> x_STAR_MR(g), z_STAR_MR(g);
        DistMatrix<T,MR,  MC> z_MR_MC(g);
        DistMatrix<T> z(g);

        // Begin the algorithm
        Scale( beta, y );
        x_STAR_MC.AlignWith( A );
        x_STAR_MR.AlignWith( A );
        z_STAR_MC.AlignWith( A );
        z_STAR_MR.AlignWith( A );
        z.AlignWith( y );
        z_MR_MC.AlignWith( y );
        Zeros( z_STAR_MC, 1, y.Width() );
        Zeros( z_STAR_MR, 1, y.Width() );
        //--------------------------------------------------------------------//
        x_STAR_MR = x;
        x_STAR_MC = x_STAR_MR;
        if( uplo == LOWER )
        {
            internal::LocalSymvRowAccumulateL
            ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate );
        }
        else
        {
            internal::LocalSymvRowAccumulateU
            ( alpha, A, x_STAR_MC, x_STAR_MR, z_STAR_MC, z_STAR_MR, conjugate );
        }

        // Combine both partial results into z, then y += z
        z_MR_MC.SumScatterFrom( z_STAR_MC );
        z = z_MR_MC;
        z.SumScatterUpdate( T(1), z_STAR_MR );
        Axpy( T(1), z, y );
        //--------------------------------------------------------------------//
        x_STAR_MC.FreeAlignments();
        x_STAR_MR.FreeAlignments();
        z_STAR_MC.FreeAlignments();
        z_STAR_MR.FreeAlignments();
        z.FreeAlignments();
        z_MR_MC.FreeAlignments();
    }
}
// Older variant of the distributed triangular matrix-matrix product
// X := alpha L X, with L applied from the left as a lower-triangular matrix
// and no transposition.
//
// X is first scaled by alpha. L and X are then traversed from bottom to top;
// each iteration updates the block row X1 as
//     X1 := L11 X1 + L10 X0.
// Moving upwards means that X0 still holds not-yet-updated rows when it is
// read.
//
// In non-RELEASE builds the grids and the dimensions are validated and
// std::logic_error is thrown on a mismatch.
inline void TrmmLLNCOld
( UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& L, DistMatrix<T>& X )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TrmmLLNCOld");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( L.Height() != L.Width() || L.Width() != X.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmLLNC: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<T>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);
    DistMatrix<T> XT(g),  X0(g),
                  XB(g),  X1(g),
                          X2(g);

    // Temporary distributions
    DistMatrix<T,STAR,MC  > L10_STAR_MC(g);
    DistMatrix<T,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<T,STAR,VR  > X1_STAR_VR(g);
    DistMatrix<T,MR,  STAR> D1Trans_MR_STAR(g);
    DistMatrix<T,MR,  MC  > D1Trans_MR_MC(g);
    DistMatrix<T,MC,  MR  > D1(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionUp
    ( X, XT,
         XB, 0 );
    while( XT.Height() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( LTL, /**/ LTR,  L00, L01, /**/ L02,
               /**/       L10, L11, /**/ L12,
         /*************/ /******************/
          LBL, /**/ LBR,  L20, L21, /**/ L22 );

        RepartitionUp
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );

        L10_STAR_MC.AlignWith( X0 );
        D1Trans_MR_STAR.AlignWith( X1 );
        D1Trans_MR_MC.AlignWith( X1 );
        D1.AlignWith( X1 );
        //--------------------------------------------------------------------//
        // Diagonal block: X1 := L11 X1, computed on redistributed copies
        L11_STAR_STAR = L11;
        X1_STAR_VR = X1;
        LocalTrmm( LEFT, LOWER, NORMAL, diag, T(1), L11_STAR_STAR, X1_STAR_VR );
        X1 = X1_STAR_VR;

        // Off-diagonal block: form X0^T L10^T = (L10 X0)^T locally, sum the
        // partial results, and transpose back to obtain D1 = L10 X0
        L10_STAR_MC = L10;
        LocalGemm
        ( TRANSPOSE, TRANSPOSE, T(1), X0, L10_STAR_MC, D1Trans_MR_STAR );
        D1Trans_MR_MC.SumScatterFrom( D1Trans_MR_STAR );
        Zeros( D1, X1.Height(), X1.Width() );
        Transpose( D1Trans_MR_MC.Matrix(), D1.Matrix() );
        Axpy( T(1), D1, X1 );
        //--------------------------------------------------------------------//
        D1.FreeAlignments();
        D1Trans_MR_MC.FreeAlignments();
        D1Trans_MR_STAR.FreeAlignments();
        L10_STAR_MC.FreeAlignments();

        SlideLockedPartitionUpDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        SlidePartitionUp
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );
    }
}
/* ================== ================== */ void Vertex_Lighting( const __int32 n_triangles, const vertex_light_manager_& vertex_light_manager, const float4_ positions[4][3], float4_ colour[4][3] ) { static const float r_screen_scale_x = 1.0f / screen_scale_x; static const float r_screen_scale_y = 1.0f / screen_scale_y; const __m128 attenuation_factor = set_all(800.0f); const __m128 specular_scale = set_all(100.0f); const __m128 diffuse_scale = set_all(20.0f); const __m128 zero = set_all(0.0f); const __m128 one = set_all(1.0f); __m128 r_screen_scale[2]; r_screen_scale[X] = set_all(r_screen_scale_x); r_screen_scale[Y] = set_all(r_screen_scale_y); __m128 screen_shift[2]; screen_shift[X] = set_all(screen_shift_x); screen_shift[Y] = set_all(screen_shift_y); __m128 clip_space_position[3][4]; __m128 vertex_colour[3][4]; for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { __m128 vertex_position[4]; for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) { vertex_position[i_triangle] = load_u(positions[i_triangle][i_vertex].f); vertex_colour[i_vertex][i_triangle] = load_u(colour[i_triangle][i_vertex].f); } Transpose(vertex_position); Transpose(vertex_colour[i_vertex]); __m128 depth = reciprocal(vertex_position[Z]); clip_space_position[i_vertex][X] = ((vertex_position[X] - screen_shift[X]) * r_screen_scale[X]) * depth; clip_space_position[i_vertex][Y] = ((vertex_position[Y] - screen_shift[Y]) * r_screen_scale[Y]) * depth; clip_space_position[i_vertex][Z] = depth; } __m128 a[3]; a[X] = clip_space_position[1][X] - clip_space_position[0][X]; a[Y] = clip_space_position[1][Y] - clip_space_position[0][Y]; a[Z] = clip_space_position[1][Z] - clip_space_position[0][Z]; __m128 b[3]; b[X] = clip_space_position[2][X] - clip_space_position[0][X]; b[Y] = clip_space_position[2][Y] - clip_space_position[0][Y]; b[Z] = clip_space_position[2][Z] - clip_space_position[0][Z]; __m128 normal[4]; normal[X] = (a[Y] * b[Z]) - (a[Z] * b[Y]); normal[Y] = (a[Z] * b[X]) - (a[X] * b[Z]); 
normal[Z] = (a[X] * b[Y]) - (a[Y] * b[X]); __m128 mag = (normal[X] * normal[X]) + (normal[Y] * normal[Y]) + (normal[Z] * normal[Z]); mag = _mm_rsqrt_ps(mag); normal[X] *= mag; normal[Y] *= mag; normal[Z] *= mag; for (__int32 i_light = 0; i_light < 1; i_light++) { for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { __m128 light_position[3]; __m128 light_colour[3]; const float intensity = vertex_light_manager.light_sources[i_light].intensity; for (__int32 i_axis = X; i_axis < W; i_axis++) { light_position[i_axis] = set_all(vertex_light_manager.light_sources[i_light].position.f[i_axis]); light_colour[i_axis] = set_all(vertex_light_manager.light_sources[i_light].colour.f[i_axis] * intensity); } const __m128 extent = set_all(40.0f); __m128i is_valid = set_all(-1); is_valid &= (clip_space_position[i_vertex][X] - light_position[X]) < extent; is_valid &= (clip_space_position[i_vertex][Y] - light_position[Y]) < extent; is_valid &= (clip_space_position[i_vertex][Z] - light_position[Z]) < extent; light_position[X] = set_all(0.0f); light_position[Y] = set_all(0.0f); light_position[Z] = set_all(0.0f); light_colour[X] = set_all(100.0f); light_colour[Y] = set_all(100.0f); light_colour[Z] = set_all(100.0f); __m128 light_ray[3]; light_ray[X] = clip_space_position[i_vertex][X] - light_position[X]; light_ray[Y] = clip_space_position[i_vertex][Y] - light_position[Y]; light_ray[Z] = clip_space_position[i_vertex][Z] - light_position[Z]; __m128 mag = (light_ray[X] * light_ray[X]) + (light_ray[Y] * light_ray[Y]) + (light_ray[Z] * light_ray[Z]); mag = _mm_rsqrt_ps(mag); light_ray[X] *= mag; light_ray[Y] *= mag; light_ray[Z] *= mag; __m128 dot = (normal[X] * light_ray[X]) + (normal[Y] * light_ray[Y]) + (normal[Z] * light_ray[Z]); dot &= dot > zero; dot = (dot * dot) * mag; __m128 distance = set_zero(); for (__int32 i_axis = X; i_axis < W; i_axis++) { __m128 d = light_position[i_axis] - clip_space_position[i_vertex][i_axis]; distance += (d * d); } __m128 scalar = reciprocal(distance) * 
attenuation_factor; scalar = max_vec(scalar, zero); scalar = min_vec(scalar, one); for (__int32 i_channel = R; i_channel < A; i_channel++) { vertex_colour[i_vertex][i_channel] += dot * specular_scale * light_colour[i_channel]; vertex_colour[i_vertex][i_channel] += mag * diffuse_scale * light_colour[i_channel]; } } } for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { Transpose(vertex_colour[i_vertex]); for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) { store_u(vertex_colour[i_vertex][i_triangle], colour[i_triangle][i_vertex].f); } } }
/* ================== ================== */ void Vertex_Lighting_REM( const __int32 n_triangles, const vertex_light_manager_& vertex_light_manager, const float4_ positions[4][3], float4_ colour[4][3] ) { //const __int32 VERTEX_COLOUR = FIRST_ATTRIBUTE + 0; static const float r_screen_scale_x = 1.0f / screen_scale_x; static const float r_screen_scale_y = 1.0f / screen_scale_y; //const __m128 attenuation_factor = set_all(200.0f); //const __m128 attenuation_factor = set_all(800.0f); //const __m128 specular_scale = set_all(100.0f); //const __m128 diffuse_scale = set_all(20.0f); __m128 r_screen_scale[2]; r_screen_scale[X] = set_all(r_screen_scale_x); r_screen_scale[Y] = set_all(r_screen_scale_y); __m128 screen_shift[2]; screen_shift[X] = set_all(screen_shift_x); screen_shift[Y] = set_all(screen_shift_y); __m128 clip_space_position[3][4]; //__m128 vertex_colour[3][4]; float4_ new_position[4][3]; for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { __m128 vertex_position[4]; for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) { vertex_position[i_triangle] = load_u(positions[i_triangle][i_vertex].f); //vertex_colour[i_vertex][i_triangle] = load_u(colour[i_triangle][i_vertex].f); } Transpose(vertex_position); //Transpose(vertex_colour[i_vertex]); __m128 depth = reciprocal(vertex_position[Z]); clip_space_position[i_vertex][X] = ((vertex_position[X] - screen_shift[X]) * r_screen_scale[X]) * depth; clip_space_position[i_vertex][Y] = ((vertex_position[Y] - screen_shift[Y]) * r_screen_scale[Y]) * depth; clip_space_position[i_vertex][Z] = depth; } __m128 a[3]; a[X] = clip_space_position[1][X] - clip_space_position[0][X]; a[Y] = clip_space_position[1][Y] - clip_space_position[0][Y]; a[Z] = clip_space_position[1][Z] - clip_space_position[0][Z]; __m128 b[3]; b[X] = clip_space_position[2][X] - clip_space_position[0][X]; b[Y] = clip_space_position[2][Y] - clip_space_position[0][Y]; b[Z] = clip_space_position[2][Z] - clip_space_position[0][Z]; __m128 normal[4]; 
normal[X] = (a[Y] * b[Z]) - (a[Z] * b[Y]); normal[Y] = (a[Z] * b[X]) - (a[X] * b[Z]); normal[Z] = (a[X] * b[Y]) - (a[Y] * b[X]); __m128 mag = (normal[X] * normal[X]) + (normal[Y] * normal[Y]) + (normal[Z] * normal[Z]); mag = _mm_rsqrt_ps(mag); normal[X] *= mag; normal[Y] *= mag; normal[Z] *= mag; float normal_4[3][4]; store_u(normal[X], normal_4[X]); store_u(normal[Y], normal_4[Y]); store_u(normal[Z], normal_4[Z]); float centre_4[3][4]; float extent_4[3][4]; const __m128 half = set_all(0.5f); for (__int32 i_axis = X; i_axis < W; i_axis++) { __m128 max; __m128 min; max = min = clip_space_position[0][i_axis]; max = max_vec(max_vec(max, clip_space_position[1][i_axis]), clip_space_position[2][i_axis]); min = min_vec(min_vec(min, clip_space_position[1][i_axis]), clip_space_position[2][i_axis]); store_u((max + min) * half, centre_4[i_axis]); store_u((max - min) * half, extent_4[i_axis]); } for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { Transpose(clip_space_position[i_vertex]); for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) { store_u(clip_space_position[i_vertex][i_triangle], new_position[i_triangle][i_vertex].f); } } const __m128 zero = set_all(0.0f); const __m128 one = set_all(1.0f); enum { MAX_LIGHTS_PER_VERTEX = 128, }; for (__int32 i_triangle = 0; i_triangle < n_triangles; i_triangle++) { __m128 centre[3]; __m128 extent[3]; for (__int32 i_axis = X; i_axis < W; i_axis++) { centre[i_axis] = set_all(centre_4[i_axis][i_triangle]); extent[i_axis] = set_all(extent_4[i_axis][i_triangle]); } float z_min = centre_4[Z][i_triangle] - extent_4[Z][i_triangle]; float z_max = centre_4[Z][i_triangle] + extent_4[Z][i_triangle]; __int32 bin_min = __int32(z_min / vertex_light_manager.bin_interval); __int32 bin_max = __int32(z_max / vertex_light_manager.bin_interval); bin_min = min(bin_min, vertex_light_manager_::NUM_BINS - 1); bin_max = min(bin_max, vertex_light_manager_::NUM_BINS - 1); bin_min = max(bin_min, 0); bin_max = max(bin_max, 0); //bin_max = 
bin_max >= 10 ? 0 : bin_max; //printf_s(" %i , %i \n", bin_min, bin_max); __int32 i_lights[MAX_LIGHTS_PER_VERTEX]; __int32 n_lights = 0; { for (__int32 i_bin = bin_min; i_bin <= bin_max; i_bin++) { const vertex_light_manager_::bin_& bin = vertex_light_manager.bin[i_bin]; for (__int32 i_light_4 = 0; i_light_4 < bin.n_lights; i_light_4 += 4) { const __int32 n = min(bin.n_lights - i_light_4, 4); __m128 light_position[4]; for (__int32 i_light = 0; i_light < n; i_light++) { __int32 index = vertex_light_manager.i_light[bin.i_start + i_light_4 + i_light]; light_position[i_light] = load_u(vertex_light_manager.light_sources[index].position.f); } Transpose(light_position); const __m128 light_extent = set_all(100.0f); __m128i is_valid = set_all(-1); is_valid &= abs(centre[X] - light_position[X]) < (extent[X] + light_extent); is_valid &= abs(centre[Y] - light_position[Y]) < (extent[Y] + light_extent); is_valid &= abs(centre[Z] - light_position[Z]) < (extent[Z] + light_extent); unsigned __int32 result_mask = store_mask(is_valid); for (__int32 i_light = 0; i_light < n; i_light++) { __int32 index = vertex_light_manager.i_light[bin.i_start + i_light_4 + i_light]; i_lights[n_lights] = index; n_lights += (result_mask >> i_light) & 0x1; } if (n_lights > MAX_LIGHTS_PER_VERTEX) { n_lights = MAX_LIGHTS_PER_VERTEX; break; } } } } for (__int32 i_vertex = 0; i_vertex < 3; i_vertex++) { __m128 vertex_position[3]; vertex_position[X] = set_all(new_position[i_triangle][i_vertex].x); vertex_position[Y] = set_all(new_position[i_triangle][i_vertex].y); vertex_position[Z] = set_all(new_position[i_triangle][i_vertex].z); __m128 vertex_colour[4]; vertex_colour[R] = set_all(0.0f); vertex_colour[G] = set_all(0.0f); vertex_colour[B] = set_all(0.0f); __m128 normal[3]; normal[X] = set_all(normal_4[X][i_triangle]); normal[Y] = set_all(normal_4[Y][i_triangle]); normal[Z] = set_all(normal_4[Z][i_triangle]); for (__int32 i_light_4 = 0; i_light_4 < n_lights; i_light_4 += 4) { const __int32 n = min(n_lights - 
i_light_4, 4); __m128 light_position[4]; __m128 light_colour[4]; unsigned __int32 mask = 0x0; float intensity_4[4]; for (__int32 i_light = 0; i_light < n; i_light++) { mask |= 0x1 << i_light; const __int32 index = i_lights[i_light_4 + i_light]; intensity_4[i_light] = vertex_light_manager.light_sources[index].intensity; light_position[i_light] = load_u(vertex_light_manager.light_sources[index].position.f); light_colour[i_light] = load_u(vertex_light_manager.light_sources[index].colour.f); } Transpose(light_position); Transpose(light_colour); __m128 light_intensity = load_u(intensity_4); __m128 light_ray[3]; light_ray[X] = vertex_position[X] - light_position[X]; light_ray[Y] = vertex_position[Y] - light_position[Y]; light_ray[Z] = vertex_position[Z] - light_position[Z]; __m128 mag = (light_ray[X] * light_ray[X]) + (light_ray[Y] * light_ray[Y]) + (light_ray[Z] * light_ray[Z]); __m128 r_mag = _mm_rsqrt_ps(mag); light_ray[X] *= r_mag; light_ray[Y] *= r_mag; light_ray[Z] *= r_mag; __m128 dot = (normal[X] * light_ray[X]) + (normal[Y] * light_ray[Y]) + (normal[Z] * light_ray[Z]); dot &= dot > zero; __m128 r_distance = reciprocal(one + mag); __m128 spec = (dot * dot) * r_distance; static const __m128 specular_coefficient = set_all(2000.0f); static const __m128 diffuse_coefficient = set_all(200.0f); //printf_s(" %f ", dot); __m128i loop_mask = load_mask[mask]; for (__int32 i_channel = R; i_channel < A; i_channel++) { __m128 final = spec * specular_coefficient * light_colour[i_channel] * light_intensity; final += r_distance * diffuse_coefficient * light_colour[i_channel] * light_intensity; vertex_colour[i_channel] += final & loop_mask; } } Transpose(vertex_colour); vertex_colour[0] += vertex_colour[1] + vertex_colour[2] + vertex_colour[3]; float4_ temp; store_u(vertex_colour[0], temp.f); colour[i_triangle][i_vertex].x += temp.x; colour[i_triangle][i_vertex].y += temp.y; colour[i_triangle][i_vertex].z += temp.z; } }
/*
 * R_SubdividePatchToGrid
 *
 * Copies the width x height control points into a working grid, inserts
 * extra columns/rows where the curve strays further than r_subdivisions
 * from the straight line between its end control points, removes
 * columns/rows that turned out to be collinear, and hands the result to
 * R_CreateSurfaceGridMesh().
 */
srfGridMesh_t *
R_SubdividePatchToGrid(int width, int height,
		       Drawvert points[MAX_PATCH_SIZE*MAX_PATCH_SIZE])
{
	int i, j, k, axis;
	drawVert_t_cleared(prev);
	drawVert_t_cleared(next);
	drawVert_t_cleared(mid);
	float len, maxLen;
	int dir;
	int swapTmp;
	Drawvert ctrl[MAX_GRID_SIZE][MAX_GRID_SIZE];
	float errorTable[2][MAX_GRID_SIZE];

	/* unpack the flat point list into the working grid */
	for(i = 0; i < width; i++){
		for(j = 0; j < height; j++){
			ctrl[j][i] = points[j*width+i];
		}
	}

	/* pass 0 works on columns; the grid is then transposed so that
	 * pass 1 can treat the rows with the same code */
	for(dir = 0; dir < 2; dir++){
		for(j = 0; j < MAX_GRID_SIZE; j++){
			errorTable[dir][j] = 0;
		}

		/* horizontal subdivisions */
		for(j = 0; j + 2 < width; j += 2){
			/* check subdivided midpoints against control points */

			/* FIXME: also check midpoints of adjacent patches against the control points
			 * this would basically stitch all patches in the same LOD group together. */

			maxLen = 0;
			for(i = 0; i < height; i++){
				Vec3 curvePt;
				Vec3 offLine;
				Vec3 lineDir;
				Vec3 projected;
				float along;

				/* calculate the point on the curve */
				for(axis = 0; axis < 3; axis++){
					curvePt[axis] = (ctrl[i][j].xyz[axis]
						+ ctrl[i][j+1].xyz[axis] * 2
						+ ctrl[i][j+2].xyz[axis]) * 0.25f;
				}

				/* see how far off the line it is
				 * using dist-from-line will not account for internal
				 * texture warping, but it gives a lot less polygons than
				 * dist-from-midpoint */
				subv3(curvePt, ctrl[i][j].xyz, curvePt);
				subv3(ctrl[i][j+2].xyz, ctrl[i][j].xyz, lineDir);
				normv3(lineDir);

				along = dotv3(curvePt, lineDir);
				scalev3(lineDir, along, projected);
				subv3(curvePt, projected, offLine);
				len = lensqrv3(offLine);	/* we will do the sqrt later */
				if(len > maxLen){
					maxLen = len;
				}
			}

			maxLen = sqrt(maxLen);

			/* if all the points are on the lines, remove the entire columns */
			if(maxLen < 0.1f){
				errorTable[dir][j+1] = 999;
				continue;
			}

			/* no room left to subdivide, or no subdivision needed:
			 * just record the error for the existing middle column */
			if(width + 2 > MAX_GRID_SIZE || maxLen <= r_subdivisions->value){
				errorTable[dir][j+1] = 1.0f/maxLen;
				continue;
			}

			errorTable[dir][j+2] = 1.0f/maxLen;

			/* insert two columns and replace the peak */
			width += 2;
			for(i = 0; i < height; i++){
				LerpDrawVert(&ctrl[i][j], &ctrl[i][j+1], &prev);
				LerpDrawVert(&ctrl[i][j+1], &ctrl[i][j+2], &next);
				LerpDrawVert(&prev, &next, &mid);

				/* shift the tail of the row two slots to the right */
				for(k = width - 1; k > j + 3; k--){
					ctrl[i][k] = ctrl[i][k-2];
				}
				ctrl[i][j + 1] = prev;
				ctrl[i][j + 2] = mid;
				ctrl[i][j + 3] = next;
			}

			/* back up and recheck this set again, it may need more subdivision */
			j -= 2;
		}

		Transpose(width, height, ctrl);
		swapTmp = width;
		width = height;
		height = swapTmp;
	}

	/* put all the aproximating points on the curve */
	PutPointsOnCurve(ctrl, width, height);

	/* cull out any rows or columns that are colinear */
	for(i = 1; i < width-1; i++){
		if(errorTable[0][i] != 999){
			continue;
		}
		for(j = i+1; j < width; j++){
			for(k = 0; k < height; k++){
				ctrl[k][j-1] = ctrl[k][j];
			}
			errorTable[0][j-1] = errorTable[0][j];
		}
		width--;
	}

	for(i = 1; i < height-1; i++){
		if(errorTable[1][i] != 999){
			continue;
		}
		for(j = i+1; j < height; j++){
			for(k = 0; k < width; k++){
				ctrl[j-1][k] = ctrl[j][k];
			}
			errorTable[1][j-1] = errorTable[1][j];
		}
		height--;
	}

#if 1
	/* flip for longest tristrips as an optimization
	 * the results should be visually identical with or
	 * without this step */
	if(height > width){
		Transpose(width, height, ctrl);
		InvertErrorTable(errorTable, width, height);
		swapTmp = width;
		width = height;
		height = swapTmp;
		InvertCtrl(width, height, ctrl);
	}
#endif

	/* calculate normals */
	MakeMeshNormals(width, height, ctrl);

	return R_CreateSurfaceGridMesh(width, height, ctrl, errorTable);
}
void FoxLi( ElementalMatrix<Complex<Real>>& APre, Int n, Real omega ) { DEBUG_CSE typedef Complex<Real> C; const Real pi = 4*Atan( Real(1) ); const C phi = Sqrt( C(0,omega/pi) ); DistMatrixWriteProxy<C,C,MC,MR> AProx( APre ); auto& A = AProx.Get(); // Compute Gauss quadrature points and weights const Grid& g = A.Grid(); DistMatrix<Real,VR,STAR> d(g), e(g); Zeros( d, n, 1 ); e.Resize( n-1, 1 ); auto& eLoc = e.Matrix(); for( Int iLoc=0; iLoc<e.LocalHeight(); ++iLoc ) { const Int i = e.GlobalRow(iLoc); const Real betaInv = 2*Sqrt(1-Pow(i+Real(1),-2)/4); eLoc(iLoc) = 1/betaInv; } DistMatrix<Real,VR,STAR> x(g); DistMatrix<Real,STAR,VR> Z(g); HermitianTridiagEig( d, e, x, Z, UNSORTED ); auto z = Z( IR(0), ALL ); DistMatrix<Real,STAR,VR> sqrtWeights( z ); auto& sqrtWeightsLoc = sqrtWeights.Matrix(); for( Int jLoc=0; jLoc<sqrtWeights.LocalWidth(); ++jLoc ) sqrtWeightsLoc(0,jLoc) = Sqrt(Real(2))*Abs(sqrtWeightsLoc(0,jLoc)); herm_eig::Sort( x, sqrtWeights, ASCENDING ); // Form the integral operator A.Resize( n, n ); DistMatrix<Real,MC,STAR> x_MC( A.Grid() ); DistMatrix<Real,MR,STAR> x_MR( A.Grid() ); x_MC.AlignWith( A ); x_MR.AlignWith( A ); x_MC = x; x_MR = x; auto& ALoc = A.Matrix(); auto& x_MCLoc = x_MC.Matrix(); auto& x_MRLoc = x_MR.Matrix(); for( Int jLoc=0; jLoc<A.LocalWidth(); ++jLoc ) { for( Int iLoc=0; iLoc<A.LocalHeight(); ++iLoc ) { const Real diff = x_MCLoc(iLoc)-x_MRLoc(jLoc); const Real theta = -omega*Pow(diff,2); const Real realPart = Cos(theta); const Real imagPart = Sin(theta); ALoc(iLoc,jLoc) = phi*C(realPart,imagPart); } } // Apply the weighting DistMatrix<Real,VR,STAR> sqrtWeightsTrans(g); Transpose( sqrtWeights, sqrtWeightsTrans ); DiagonalScale( LEFT, NORMAL, sqrtWeightsTrans, A ); DiagonalScale( RIGHT, NORMAL, sqrtWeightsTrans, A ); }
void DrawBl() { Shader* s = &g_shader[g_curS]; //return; for(int i=0; i<BUILDINGS; i++) { Building* b = &g_building[i]; if(!b->on) continue; const BuildingT* t = &g_bltype[b->type]; //const BuildingT* t = &g_bltype[BUILDING_APARTMENT]; Model* m = &g_model[ t->model ]; Vec3f vmin(b->drawpos.x - t->widthx*TILE_SIZE/2, b->drawpos.y, b->drawpos.z - t->widthz*TILE_SIZE/2); Vec3f vmax(b->drawpos.x + t->widthx*TILE_SIZE/2, b->drawpos.y + (t->widthx+t->widthz)*TILE_SIZE/2, b->drawpos.z + t->widthz*TILE_SIZE/2); if(!g_frustum.boxin2(vmin.x, vmin.y, vmin.z, vmax.x, vmax.y, vmax.z)) continue; if(!b->finished) m = &g_model[ t->cmodel ]; /* m->draw(0, b->drawpos, 0); */ float pitch = 0; float yaw = 0; Matrix modelmat; float radians[] = {(float)DEGTORAD(pitch), (float)DEGTORAD(yaw), 0}; modelmat.translation((const float*)&b->drawpos); Matrix rotation; rotation.rotrad(radians); modelmat.postmult(rotation); glUniformMatrix4fv(s->m_slot[SSLOT_MODELMAT], 1, 0, modelmat.m_matrix); Matrix modelview; #ifdef SPECBUMPSHADOW modelview.set(g_camview.m_matrix); #endif modelview.postmult(modelmat); glUniformMatrix4fv(s->m_slot[SSLOT_MODELVIEW], 1, 0, modelview.m_matrix); Matrix mvp; #if 0 mvp.set(modelview.m_matrix); mvp.postmult(g_camproj); #elif 0 mvp.set(g_camproj.m_matrix); mvp.postmult(modelview); #else mvp.set(g_camproj.m_matrix); mvp.postmult(g_camview); mvp.postmult(modelmat); #endif glUniformMatrix4fv(s->m_slot[SSLOT_MVP], 1, 0, mvp.m_matrix); Matrix modelviewinv; Transpose(modelview, modelview); Inverse2(modelview, modelviewinv); //Transpose(modelviewinv, modelviewinv); glUniformMatrix4fv(s->m_slot[SSLOT_NORMALMAT], 1, 0, modelviewinv.m_matrix); VertexArray* va = &b->drawva; m->usetex(); glVertexAttribPointer(s->m_slot[SSLOT_POSITION], 3, GL_FLOAT, GL_FALSE, 0, va->vertices); glVertexAttribPointer(s->m_slot[SSLOT_TEXCOORD0], 2, GL_FLOAT, GL_FALSE, 0, va->texcoords); if(s->m_slot[SSLOT_NORMAL] != -1) glVertexAttribPointer(s->m_slot[SSLOT_NORMAL], 3, GL_FLOAT, GL_FALSE, 0, 
va->normals); glDrawArrays(GL_TRIANGLES, 0, va->numverts); } }
// Matrix inverse ---------------------------------------------------------------------
//
// Returns the inverse of a 4x4 matrix, computed with SSE shuffles and
// multiplies by cofactor expansion:
//   1. transpose the input,
//   2. build d0/d1/d2 as differences of pairwise products of the transposed
//      rows,
//   3. combine them with permuted rows into the four result rows
//      c0, c2, c4, c6,
//   4. divide by the determinant (dot product of c0 with the first
//      transposed row).
//
// There is no check for a zero determinant: the reciprocal is taken
// unconditionally, so a singular input is divided by zero.
//
// Going by the lane comments below, Shuffle<i, j, k, l>(a, b) yields
// (a[i], a[j], b[k], b[l]); Permute is presumably the single-operand
// equivalent -- confirm against the helpers' definitions.
static Float4x4 VFunction Inverse(const Float4x4& matrix)
{
    Float4x4 mTransposed = Transpose(matrix);

    // First halves of the pairwise products
    Vector v00 = Permute<0, 0, 1, 1>(mTransposed.z);
    Vector v10 = Permute<2, 3, 2, 3>(mTransposed.w);
    Vector v01 = Permute<0, 0, 1, 1>(mTransposed.x);
    Vector v11 = Permute<2, 3, 2, 3>(mTransposed.y);
    Vector v02 = Shuffle<0, 2, 0, 2>(mTransposed.z, mTransposed.x);
    Vector v12 = Shuffle<1, 3, 1, 3>(mTransposed.w, mTransposed.y);

    Vector d0 = _mm_mul_ps(v00, v10);
    Vector d1 = _mm_mul_ps(v01, v11);
    Vector d2 = _mm_mul_ps(v02, v12);

    // Second halves: same rows with the lane selections swapped
    v00 = Permute<2, 3, 2, 3>(mTransposed.z);
    v10 = Permute<0, 0, 1, 1>(mTransposed.w);
    v01 = Permute<2, 3, 2, 3>(mTransposed.x);
    v11 = Permute<0, 0, 1, 1>(mTransposed.y);
    v02 = Shuffle<1, 3, 1, 3>(mTransposed.z, mTransposed.x);
    v12 = Shuffle<0, 2, 0, 2>(mTransposed.w, mTransposed.y);

    v00 = _mm_mul_ps(v00, v10);
    v01 = _mm_mul_ps(v01, v11);
    v02 = _mm_mul_ps(v02, v12);

    // d0, d1, d2 = first product - second product
    d0 = _mm_sub_ps(d0, v00);
    d1 = _mm_sub_ps(d1, v01);
    d2 = _mm_sub_ps(d2, v02);

    // First term of each result row
    // v11 = d0.y, d0.w, d2.y, d2.y
    v11 = Shuffle<1, 3, 1, 1>(d0, d2);
    v00 = Permute<1, 2, 0, 1>(mTransposed.y);
    v10 = Shuffle<2, 0, 3, 0>(v11, d0);
    v01 = Permute<2, 0, 1, 0>(mTransposed.x);
    v11 = Shuffle<1, 2, 1, 2>(v11, d0);
    // v13 = d1.y, d1.w, d2.w, d2.w
    Vector v13 = Shuffle<1, 3, 3, 3>(d1, d2);
    v02 = Permute<1, 2, 0, 1>(mTransposed.w);
    v12 = Shuffle<2, 0, 3, 0>(v13, d1);
    Vector v03 = Permute<2, 0, 1, 0>(mTransposed.z);
    v13 = Shuffle<1, 2, 1, 2>(v13, d1);

    Vector c0 = _mm_mul_ps(v00, v10);
    Vector c2 = _mm_mul_ps(v01, v11);
    Vector c4 = _mm_mul_ps(v02, v12);
    Vector c6 = _mm_mul_ps(v03, v13);

    // Second term, subtracted from the first
    // v11 = d0.x, d0.y, d2.x, d2.x
    v11 = Shuffle<0, 1, 0, 0>(d0, d2);
    v00 = Permute<2, 3, 1, 2>(mTransposed.y);
    v10 = Shuffle<3, 0, 1, 2>(d0, v11);
    v01 = Permute<3, 2, 3, 1>(mTransposed.x);
    v11 = Shuffle<2, 1, 2, 0>(d0, v11);
    // v13 = d1.x, d1.y, d2.z, d2.z
    v13 = Shuffle<0, 1, 2, 2>(d1, d2);
    v02 = Permute<2, 3, 1, 2>(mTransposed.w);
    v12 = Shuffle<3, 0, 1, 2>(d1, v13);
    v03 = Permute<3, 2, 3, 1>(mTransposed.z);
    v13 = Shuffle<2, 1, 2, 0>(d1, v13);

    v00 = _mm_mul_ps(v00, v10);
    v01 = _mm_mul_ps(v01, v11);
    v02 = _mm_mul_ps(v02, v12);
    v03 = _mm_mul_ps(v03, v13);
    c0 = _mm_sub_ps(c0, v00);
    c2 = _mm_sub_ps(c2, v01);
    c4 = _mm_sub_ps(c4, v02);
    c6 = _mm_sub_ps(c6, v03);

    // Third term; its sign alternates per lane, handled below by computing
    // both the sum and the difference and interleaving them
    v00 = Permute<3, 0, 3, 0>(mTransposed.y);
    // v10 = d0.z, d0.z, d2.x, d2.y
    v10 = Shuffle<2, 2, 0, 1>(d0, d2);
    v10 = Permute<0, 3, 2, 0>(v10);
    v01 = Permute<1, 3, 0, 2>(mTransposed.x);
    // v11 = d0.x, d0.w, d2.x, d2.y
    v11 = Shuffle<0, 3, 0, 1>(d0, d2);
    v11 = Permute<3, 0, 1, 2>(v11);
    v02 = Permute<3, 0, 3, 0>(mTransposed.w);
    // v12 = d1.z, d1.z, d2.z, d2.w
    v12 = Shuffle<2, 2, 2, 3>(d1, d2);
    v12 = Permute<0, 3, 2, 0>(v12);
    v03 = Permute<1, 3, 0, 2>(mTransposed.z);
    // v13 = d1.x, d1.w, d2.z, d2.w
    v13 = Shuffle<0, 3, 2, 3>(d1, d2);
    v13 = Permute<3, 0, 1, 2>(v13);

    v00 = _mm_mul_ps(v00, v10);
    v01 = _mm_mul_ps(v01, v11);
    v02 = _mm_mul_ps(v02, v12);
    v03 = _mm_mul_ps(v03, v13);

    // Sum and difference of each row with its third term
    Vector c1 = _mm_sub_ps(c0, v00);
    c0 = _mm_add_ps(c0, v00);
    Vector c3 = _mm_add_ps(c2, v01);
    c2 = _mm_sub_ps(c2, v01);
    Vector c5 = _mm_sub_ps(c4, v02);
    c4 = _mm_add_ps(c4, v02);
    Vector c7 = _mm_add_ps(c6, v03);
    c6 = _mm_sub_ps(c6, v03);

    // Interleave: lanes 0 and 2 come from the first operand, lanes 1 and 3
    // from the second, then the middle lanes are swapped back into order
    c0 = Shuffle<0, 2, 1, 3>(c0, c1);
    c2 = Shuffle<0, 2, 1, 3>(c2, c3);
    c4 = Shuffle<0, 2, 1, 3>(c4, c5);
    c6 = Shuffle<0, 2, 1, 3>(c6, c7);
    c0 = Permute<0, 2, 1, 3>(c0);
    c2 = Permute<0, 2, 1, 3>(c2);
    c4 = Permute<0, 2, 1, 3>(c4);
    c6 = Permute<0, 2, 1, 3>(c6);

    // Get the determinant
    // (presumably Dot replicates the scalar result to all lanes -- confirm)
    Vector vTemp = Dot((Float4)c0, mTransposed.x);
    // Disabled optional determinant output:
    //if(pDeterminant != nullptr)
    //    *pDeterminant = vTemp;

    // Scale every row by 1 / determinant (no guard against a zero value)
    vTemp = _mm_div_ps(Constant::One, vTemp);
    Float4x4 mResult;
    mResult.x = _mm_mul_ps(c0, vTemp);
    mResult.y = _mm_mul_ps(c2, vTemp);
    mResult.z = _mm_mul_ps(c4, vTemp);
    mResult.w = _mm_mul_ps(c6, vTemp);
    return mResult;
}
// -- // static Float4x4 VFunction MultiplyTranspose(const Float4x4& matrixA, const Float4x4& matrixB) { return Transpose(Multiply(matrixA, matrixB)); }