void LSLocateCircularInterface::setLevelSetPatchData(int D_idx, Pointer<HierarchyMathOps> hier_math_ops, double /*time*/, bool /*initial_time*/) { // In this version of this class, the initial level set location is set to be // exact since we always know the radius of the ball Pointer<PatchHierarchy<NDIM> > patch_hierarchy = hier_math_ops->getPatchHierarchy(); const int coarsest_ln = 0; const int finest_ln = patch_hierarchy->getFinestLevelNumber(); // Set the initial condition for locating the interface const double& R = d_circle->R; const IBTK::Vector& X0 = d_circle->X0; for (int ln = coarsest_ln; ln <= finest_ln; ++ln) { Pointer<PatchLevel<NDIM> > level = patch_hierarchy->getPatchLevel(ln); for (PatchLevel<NDIM>::Iterator p(level); p; p++) { Pointer<Patch<NDIM> > patch = level->getPatch(p()); const Box<NDIM>& patch_box = patch->getBox(); Pointer<CellData<NDIM, double> > D_data = patch->getPatchData(D_idx); for (Box<NDIM>::Iterator it(patch_box); it; it++) { CellIndex<NDIM> ci(it()); // Get physical coordinates IBTK::Vector coord = IBTK::Vector::Zero(); Pointer<CartesianPatchGeometry<NDIM> > patch_geom = patch->getPatchGeometry(); const double* patch_X_lower = patch_geom->getXLower(); const hier::Index<NDIM>& patch_lower_idx = patch_box.lower(); const double* const patch_dx = patch_geom->getDx(); for (int d = 0; d < NDIM; ++d) { coord[d] = patch_X_lower[d] + patch_dx[d] * (static_cast<double>(ci(d) - patch_lower_idx(d)) + 0.5); } const double distance = std::sqrt(std::pow((coord[0] - X0(0)), 2.0) + std::pow((coord[1] - X0(1)), 2.0) #if (NDIM == 3) + std::pow((coord[2] - X0(2)), 2.0) #endif ); (*D_data)(ci) = distance - R; } } } return; } // setLevelSetPatchData
// Assigns a regularisation cost to every valid, non-vertical edge of every
// node in the volume [X0,X1) x [Y0,Y1) x [ZMin,ZMax).
//
// The cost of an edge is aCste + aCoeff * (mean of the mDirZPlus residual
// flows of its two end nodes). When Cnx8() is true the cost is additionally
// scaled by a per-edge factor selected through tabCRIsArcV8. The result is
// rounded to the nearest integer and clamped from below by aVmin.
template <class cCRNode>
void cTplCoxRoyAlgo<cCRNode>::SetStdCostRegul(double aCoeff,double aCste,int aVmin)
{
    for (int aX = X0(); aX < X1(); aX++)
    {
        for (int aY = Y0(); aY < Y1(); aY++)
        {
            for (int aZ = ZMin(aX,aY); aZ < ZMax(aX,aY); aZ++)
            {
                cRoyPt aPSrc(aX,aY,aZ);
                cCRNode & aNodeSrc = NodeOfP(aPSrc);
                // Read once, before any edge of this node is rewritten.
                int aFlowSrc = aNodeSrc.ResidualFlow(mDirZPlus);

                for (int aKEdge = 0; aKEdge < NbEdges(); aKEdge++)
                {
                    // Only valid, non-vertical edges receive a regularisation cost.
                    if ((!aNodeSrc.EdgeIsValide(aKEdge)) || tabCRIsVertical[aKEdge])
                        continue;

                    cRoyPt aPDst(aPSrc,aKEdge);
                    cCRNode & aNodeDst = NodeOfP(aPDst);
                    int aFlowDst = aNodeDst.ResidualFlow(mDirZPlus);

                    double aRealCost = aCste + aCoeff*(aFlowSrc+aFlowDst)/2.0;
                    if (Cnx8())
                    {
                        if (tabCRIsArcV8[aKEdge])
                            aRealCost *= 0.2928;
                        else
                            aRealCost *= 0.4142;
                    }

                    // Round to nearest, then enforce the minimal cost.
                    int aIntCost = int(aRealCost+0.5);
                    aNodeSrc.SetResidualFlow(aKEdge, (aIntCost<aVmin) ? aVmin : aIntCost);
                }
            }
        }
    }
}
/** An sample for taylor expansion of logdet(X). */ void taylorSample() { std::string ans; char rowChar[5]; int rowTmp = ROW; sprintf(rowChar, "%d", rowTmp); std::string row = rowChar; // Initialize the matrices. symbolic_matrix_type X("X", ROW, COL); symbolic_matrix_type X0("X0", ROW, COL); symbolic_matrix_type Delta("(X-X0)", ROW, COL); AMD::SymbolicScalarMatlab a2("1/2!"); AMD::SymbolicScalarMatlab a3("1/3!"); SymbolicSMFunc r2(a2,ROW,COL); SymbolicSMFunc r3(a3,ROW, COL); // Initialize MatrixMatrixFunction. SymbolicMMFunc fX(X, false); SymbolicMMFunc fX0(X0, false); SymbolicMMFunc fDelta(Delta, true); // Compute Taylor series iteratively. SymbolicSMFunc f0 = logdet(fX0); SymbolicSMFunc f1 = trace(fDelta * transpose(*f0.derivativeFuncVal)); SymbolicSMFunc f2 = trace(fDelta * transpose(*f1.derivativeFuncVal)); SymbolicSMFunc f3 = trace(fDelta * transpose(*f2.derivativeFuncVal)); // Taylor Expansion. SymbolicSMFunc func = f0 + f1 + r2*f2 + r3*f3; std::cout<<"The first 4 terms of Taylor Expansion for logdet(X) around X0 is:"; std::cout << std::endl; std::cout << func.functionVal.getString() << std::endl; }
// Sets up a one-dimensional model: all matrices/vectors are sized to 1 and
// then filled with the constants below.
AR_Process::AR_Process(void)
{
    // --- Dimensions -------------------------------------------------------
    F.resize(1,1);
    f.resize(1);
    G.resize(1,1);
    Qw.resize(1);
    H.resize(1,1);
    h.resize(1);
    Qv.resize(1);
    X0.resize(1);
    R0.resize(1);

    // --- State equation terms: F, f, G, Qw ---------------------------------
    F(0,0) = 0.8;
    f(0) = 0.;
    G.identity();
    Qw(0,0) = 0.1;

    // --- Observation terms: H, h, Qv ---------------------------------------
    H(0,0) = 1;
    h(0) = 0.;
    Qv(0,0) = 1;

    // --- Initial state X0 and R0 (zeroed) ----------------------------------
    X0(0) = 10.;
    R0.zero();
}
inline void TrmmLLNA ( UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE PushCallStack("internal::TrmmLLNA"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( L.Height() != L.Width() || L.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmLLNA: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); DistMatrix<T> XL(g), XR(g), X0(g), X1(g), X2(g); DistMatrix<T,VR, STAR> X1_VR_STAR(g); DistMatrix<T,STAR,MR > X1Trans_STAR_MR(g); DistMatrix<T,MC, STAR> Z1_MC_STAR(g); X1_VR_STAR.AlignWith( L ); X1Trans_STAR_MR.AlignWith( L ); Z1_MC_STAR.AlignWith( L ); PartitionRight( X, XL, XR, 0 ); while( XL.Width() < X.Width() ) { RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); Zeros( X1.Height(), X1.Width(), Z1_MC_STAR ); //--------------------------------------------------------------------// X1_VR_STAR = X1; X1Trans_STAR_MR.TransposeFrom( X1_VR_STAR ); LocalTrmmAccumulateLLN ( TRANSPOSE, diag, alpha, L, X1Trans_STAR_MR, Z1_MC_STAR ); X1.SumScatterFrom( Z1_MC_STAR ); //--------------------------------------------------------------------// SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Convenience overload: starts every slice function in g from the same point
// x0 and forwards to the vector overload of slice_sample_multi. The arguments
// w and m are passed through unchanged.
std::pair<int,double> slice_sample_multi(double x0, vector<slice_function*>& g, double w, int m)
{
  // One starting value per function; sized directly from g.size() so the
  // count is not narrowed through an int.
  vector<double> X0(g.size(), x0);

  return slice_sample_multi(X0, g, w, m);
}
inline void TrmmRUNA ( UnitOrNonUnit diag, T alpha, const DistMatrix<T>& U, DistMatrix<T>& X ) { #ifndef RELEASE CallStackEntry entry("internal::TrmmRUNA"); if( U.Grid() != X.Grid() ) throw std::logic_error("{U,X} must be distributed over the same grid"); #endif const Grid& g = U.Grid(); DistMatrix<T> XT(g), X0(g), XB(g), X1(g), X2(g); DistMatrix<T,STAR,VC > X1_STAR_VC(g); DistMatrix<T,STAR,MC > X1_STAR_MC(g); DistMatrix<T,MR, STAR> Z1Trans_MR_STAR(g); DistMatrix<T,MR, MC > Z1Trans_MR_MC(g); X1_STAR_VC.AlignWith( U ); X1_STAR_MC.AlignWith( U ); Z1Trans_MR_STAR.AlignWith( U ); PartitionDown ( X, XT, XB, 0 ); while( XT.Height() < X.Height() ) { RepartitionDown ( XT, X0, /**/ /**/ X1, XB, X2 ); Z1Trans_MR_MC.AlignWith( X1 ); //--------------------------------------------------------------------// X1_STAR_VC = X1; X1_STAR_MC = X1_STAR_VC; Zeros( Z1Trans_MR_STAR, X1.Width(), X1.Height() ); LocalTrmmAccumulateRUN ( TRANSPOSE, diag, alpha, U, X1_STAR_MC, Z1Trans_MR_STAR ); Z1Trans_MR_MC.SumScatterFrom( Z1Trans_MR_STAR ); Transpose( Z1Trans_MR_MC.Matrix(), X1.Matrix() ); //--------------------------------------------------------------------// Z1Trans_MR_MC.FreeAlignments(); SlidePartitionDown ( XT, X0, X1, /**/ /**/ XB, X2 ); } }
// Fills in the model constants: scalar parameters, the two 1x1 noise terms,
// a two-component initial state and its 2x2 companion matrix R0.
Van_Der_Pol::Van_Der_Pol(void)
{
    // Scalar parameters
    lambda = 3.;
    Ts = .1;

    // Noise terms Qw and Qv
    Qw.resize(1);
    Qw(0,0) = 1.;

    Qv.resize(1);
    Qv(0,0) = 0.1;

    // Initial state: both components start at 0.5
    X0.resize(2);
    X0(0) = 0.5;
    X0(1) = 0.5;

    // R0 is cleared first, then its two diagonal entries are set explicitly
    R0.resize(2);
    R0.zero();
    R0(0,0) = 0.;
    R0(1,1) = .1;
}
bool ACovarianceFunction::check_covariance_Kgrad_x(ACovarianceFunction& covar,mfloat_t relchange,mfloat_t threshold,bool check_diag) { mfloat_t RV=0; //copy inputs for which we calculate gradients CovarInput X = covar.getX(); CovarInput X0 = X; for (int ic=0;ic<X.cols();ic++) { //analytical gradient is per columns all in one go: MatrixXd Kgrad_x = covar.Kgrad_X(ic); MatrixXd Kgrad_x_diag = covar.Kdiag_grad_X(ic); for (int ir=0;ir<X.rows();ir++) { mfloat_t change = relchange*X0(ir,ic); change = std::max(change,1E-5); X(ir,ic) = X0(ir,ic) + change; covar.setX(X); MatrixXd Lplus = covar.K(); X(ir,ic) = X0(ir,ic) - change; covar.setX(X); MatrixXd Lminus = covar.K(); X(ir,ic) = X0(ir,ic); covar.setX(X); //numerical gradient MatrixXd diff_numerical = (Lplus-Lminus)/(2.*change); //build analytical gradient matrix MatrixXd diff_analytical = MatrixXd::Zero(X.rows(),X.rows()); diff_analytical.row(ir) = Kgrad_x.row(ir); diff_analytical.col(ir) += Kgrad_x.row(ir); RV+= (diff_numerical-diff_analytical).squaredNorm(); //difference if (check_diag) { double delta =(diff_numerical(ir,ir)-Kgrad_x_diag(ir)); RV+= delta*delta; } } //end for ir } return (RV < threshold); }
Foam::tmp<Foam::pointField> Foam::RBD::rigidBodyMotion::transformPoints ( const label bodyID, const pointField& initialPoints ) const { // Calculate the transform from the initial state in the global frame // to the current state in the global frame spatialTransform X(X0(bodyID).inv() & X00(bodyID)); tmp<pointField> tpoints(new pointField(initialPoints.size())); pointField& points = tpoints.ref(); forAll(points, i) { points[i] = X.transformPoint(initialPoints[i]); }
inline void TrsmLUNSmall ( UnitOrNonUnit diag, F alpha, const DistMatrix<F,VC,STAR>& U, DistMatrix<F,VC,STAR>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLUNSmall"); if( U.Grid() != X.Grid() ) throw std::logic_error ("U and X must be distributed over the same grid"); if( U.Height() != U.Width() || U.Width() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrsmLUN: \n" << " U ~ " << U.Height() << " x " << U.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str() ); } if( U.ColAlignment() != X.ColAlignment() ) throw std::logic_error("U and X are assumed to be aligned"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F,VC,STAR> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F,VC,STAR> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,STAR> X1_STAR_STAR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[VC,* ] X1_STAR_STAR = X1; // X1[* ,* ] <- X1[VC,* ] // X1[* ,* ] := U11^-1[* ,* ] X1[* ,* ] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_STAR, checkIfSingular ); X1 = X1_STAR_STAR; // X0[VC,* ] -= U01[VC,* ] X1[* ,* ] LocalGemm( NORMAL, NORMAL, F(-1), U01, X1_STAR_STAR, F(1), X0 ); //--------------------------------------------------------------------// SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, 
U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
inline void TrsmLUNLarge ( UnitOrNonUnit diag, F alpha, const DistMatrix<F>& U, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLUNLarge"); #endif const Grid& g = U.Grid(); // Matrix views DistMatrix<F> UTL(g), UTR(g), U00(g), U01(g), U02(g), UBL(g), UBR(g), U10(g), U11(g), U12(g), U20(g), U21(g), U22(g); DistMatrix<F> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,MC, STAR> U01_MC_STAR(g); DistMatrix<F,STAR,STAR> U11_STAR_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( U, UTL, UTR, UBL, UBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( UTL, /**/ UTR, U00, U01, /**/ U02, /**/ U10, U11, /**/ U12, /*************/ /******************/ UBL, /**/ UBR, U20, U21, /**/ U22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); U01_MC_STAR.AlignWith( X0 ); X1_STAR_MR.AlignWith( X0 ); //--------------------------------------------------------------------// U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR] X1_STAR_VR = X1; // X1[* ,VR] <- X1[MC,MR] // X1[* ,VR] := U11^-1[* ,* ] X1[* ,VR] LocalTrsm ( LEFT, UPPER, NORMAL, diag, F(1), U11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR] // X0[MC,MR] -= U01[MC,* ] X1[* ,MR] LocalGemm( NORMAL, NORMAL, F(-1), U01_MC_STAR, X1_STAR_MR, F(1), X0 ); //--------------------------------------------------------------------// U01_MC_STAR.FreeAlignments(); X1_STAR_MR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( UTL, /**/ UTR, U00, /**/ U01, U02, /*************/ /******************/ /**/ U10, /**/ U11, U12, UBL, /**/ UBR, U20, /**/ U21, U22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
// Emits the stub for a binary arithmetic/bitwise operation (selected by op_)
// on two int32 values held in R0 and R1.
//
// Layout of the emitted code:
//   1. type guards on R0 and R1 (jump to `failure` if either is not int32),
//   2. the op-specific fast path, which leaves the result payload in R0_,
//   3. EmitReturnFromIC,
//   4. op-specific out-of-line tails (maybeNegZero for MUL, revertRegister
//      for DIV/MOD),
//   5. the `failure` label followed by EmitStubGuardFailure.
//
// Always returns true.
bool
ICBinaryArith_Int32::Compiler::generateStubCode(MacroAssembler& masm)
{
    // Guard that R0 is an integer and R1 is an integer.
    Label failure;
    masm.branchTestInt32(Assembler::NotEqual, R0, &failure);
    masm.branchTestInt32(Assembler::NotEqual, R1, &failure);

    // Add R0 and R1. Don't need to explicitly unbox, just use R2.
    // R2_ serves as the scratch register; Wscratch is its 32-bit view.
    Register Rscratch = R2_;
    ARMRegister Wscratch = ARMRegister(Rscratch, 32);
#ifdef MERGE
    // DIV and MOD need an extra non-volatile ValueOperand to hold R0.
    AllocatableGeneralRegisterSet savedRegs(availableGeneralRegs(2));
    savedRegs.set() = GeneralRegisterSet::Intersect(GeneralRegisterSet::NonVolatile(), savedRegs);
#endif
    // get some more ARM-y names for the registers
    // (W* are the 32-bit views, X* the 64-bit views of the same registers).
    ARMRegister W0(R0_, 32);
    ARMRegister X0(R0_, 64);
    ARMRegister W1(R1_, 32);
    ARMRegister X1(R1_, 64);
    ARMRegister WTemp(ExtractTemp0, 32);
    ARMRegister XTemp(ExtractTemp0, 64);
    Label maybeNegZero, revertRegister;
    switch(op_) {
      case JSOP_ADD:
        masm.Adds(WTemp, W0, Operand(W1));

        // Just jump to failure on overflow. R0 and R1 are preserved, so we can
        // just jump to the next stub.
        masm.j(Assembler::Overflow, &failure);

        // Box the result and return. We know R0 already contains the
        // integer tag, so we just need to move the payload into place.
        masm.movePayload(ExtractTemp0, R0_);
        break;

      case JSOP_SUB:
        // Same scheme as ADD: compute into the temp, bail out on overflow,
        // then move the payload into R0_.
        masm.Subs(WTemp, W0, Operand(W1));
        masm.j(Assembler::Overflow, &failure);
        masm.movePayload(ExtractTemp0, R0_);
        break;

      case JSOP_MUL:
        // mul32 is handed both the failure label and the maybeNegZero label;
        // the latter is bound in the second switch below.
        masm.mul32(R0.valueReg(), R1.valueReg(), Rscratch, &failure, &maybeNegZero);
        masm.movePayload(Rscratch, R0_);
        break;

      case JSOP_DIV:
      case JSOP_MOD: {

        // Check for INT_MIN / -1, it results in a double.
        Label check2;
        masm.Cmp(W0, Operand(INT_MIN));
        masm.B(&check2, Assembler::NotEqual);
        masm.Cmp(W1, Operand(-1));
        masm.j(Assembler::Equal, &failure);
        masm.bind(&check2);
        Label no_fail;
        // Check for both division by zero and 0 / X with X < 0 (results in -0).
        masm.Cmp(W1, Operand(0));
        // If the divisor (W1) is > 0, then it can't be bad.
        masm.B(&no_fail, Assembler::GreaterThan);
        // If the divisor is == 0, the conditional compare is skipped and the
        // Z flag is forced, so the branch below fails. If the divisor is < 0
        // (the only other case), the dividend (W0) is compared against 0 and
        // we fail when it is 0.
        masm.Ccmp(W0, Operand(0), vixl::ZFlag, Assembler::NotEqual);
        masm.B(&failure, Assembler::Equal);
        masm.bind(&no_fail);
        masm.Sdiv(Wscratch, W0, W1);
        // Start calculating the remainder, x - (x / y) * y.
        masm.mul(WTemp, W1, Wscratch);
        if (op_ == JSOP_DIV) {
            // Result is a double if the remainder != 0, which happens
            // when (x/y)*y != x.
            masm.branch32(Assembler::NotEqual, R0.valueReg(), ExtractTemp0, &revertRegister);
            masm.movePayload(Rscratch, R0_);
        } else {
            // Calculate the actual mod. Set the condition code, so we can see if it is non-zero.
            masm.Subs(WTemp, W0, WTemp);

            // If X % Y == 0 and X < 0, the result is -0.
            masm.Ccmp(W0, Operand(0), vixl::NoFlag, Assembler::Equal);
            masm.branch(Assembler::LessThan, &revertRegister);
            masm.movePayload(ExtractTemp0, R0_);
        }
        break;
      }
        // ORR, EOR and AND can trivially be made to work on the boxed values
        // without affecting the tag of the destination.
      case JSOP_BITOR:
        masm.Orr(X0, X0, Operand(X1));
        break;
      case JSOP_BITXOR:
        // Only the zero-extended low 32 bits of R1 are XOR-ed in.
        // NOTE(review): presumably so the upper (tag) bits of X0 are left
        // untouched -- confirm.
        masm.Eor(X0, X0, Operand(W1, vixl::UXTW));
        break;
      case JSOP_BITAND:
        masm.And(X0, X0, Operand(X1));
        break;
        // LSH, RSH and URSH can not.
      case JSOP_LSH:
        // ARM will happily try to shift by more than 0x1f.
        masm.Lsl(Wscratch, W0, W1);
        masm.movePayload(Rscratch, R0.valueReg());
        break;
      case JSOP_RSH:
        masm.Asr(Wscratch, W0, W1);
        masm.movePayload(Rscratch, R0.valueReg());
        break;
      case JSOP_URSH:
        masm.Lsr(Wscratch, W0, W1);
        if (allowDouble_) {
            Label toUint;
            // Testing for negative is equivalent to testing bit 31
            masm.Tbnz(Wscratch, 31, &toUint);
            // Move result and box for return.
            masm.movePayload(Rscratch, R0_);
            EmitReturnFromIC(masm);

            // Bit 31 set: convert the result as an unsigned 32-bit value to a
            // double and box that; the shared return below is then used.
            masm.bind(&toUint);
            masm.convertUInt32ToDouble(Rscratch, ScratchDoubleReg);
            masm.boxDouble(ScratchDoubleReg, R0, ScratchDoubleReg);
        } else {
            // Testing for negative is equivalent to testing bit 31
            masm.Tbnz(Wscratch, 31, &failure);
            // Move result for return.
            masm.movePayload(Rscratch, R0_);
        }
        break;
      default:
        MOZ_CRASH("Unhandled op for BinaryArith_Int32.");
    }

    // Shared return for every fast path that reached the end of its case.
    EmitReturnFromIC(masm);

    // Out-of-line tails that the fast paths above branch to.
    switch (op_) {
      case JSOP_MUL:
        masm.bind(&maybeNegZero);

        // Result is -0 if exactly one of lhs or rhs is negative.
        masm.Cmn(W0, W1);
        masm.j(Assembler::Signed, &failure);

        // Result is +0, so use the zero register.
        masm.movePayload(rzr, R0_);
        EmitReturnFromIC(masm);
        break;
      case JSOP_DIV:
      case JSOP_MOD:
        // Nothing is emitted between this label and `failure`, so
        // revertRegister falls straight through into the guard failure.
        masm.bind(&revertRegister);
        break;
      default:
        break;
    }

    // Failure case - jump to next stub.
    masm.bind(&failure);
    EmitStubGuardFailure(masm);

    return true;
}
inline void TrmmLLTA ( Orientation orientation, UnitOrNonUnit diag, T alpha, const DistMatrix<T>& L, DistMatrix<T>& X ) { #ifndef RELEASE PushCallStack("internal::TrmmLLTA"); if( L.Grid() != X.Grid() ) throw std::logic_error ("L and X must be distributed over the same grid"); if( orientation == NORMAL ) throw std::logic_error ("TrmmLLTA expects a (Conjugate)Transpose option"); if( L.Height() != L.Width() || L.Height() != X.Height() ) { std::ostringstream msg; msg << "Nonconformal TrmmLLTA: \n" << " L ~ " << L.Height() << " x " << L.Width() << "\n" << " X ~ " << X.Height() << " x " << X.Width() << "\n"; throw std::logic_error( msg.str().c_str() ); } #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<T> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<T> XL(g), XR(g), X0(g), X1(g), X2(g); DistMatrix<T,MC,STAR> X1_MC_STAR(g); DistMatrix<T,MR,STAR> Z1_MR_STAR(g); DistMatrix<T,MR,MC > Z1_MR_MC(g); X1_MC_STAR.AlignWith( L ); Z1_MR_STAR.AlignWith( L ); PartitionRight( X, XL, XR, 0 ); while( XL.Width() < X.Width() ) { RepartitionRight ( XL, /**/ XR, X0, /**/ X1, X2 ); Zeros( X1.Height(), X1.Width(), Z1_MR_STAR ); //--------------------------------------------------------------------// X1_MC_STAR = X1; LocalTrmmAccumulateLLT ( orientation, diag, alpha, L, X1_MC_STAR, Z1_MR_STAR ); Z1_MR_MC.SumScatterFrom( Z1_MR_STAR ); X1 = Z1_MR_MC; //--------------------------------------------------------------------// SlidePartitionRight ( XL, /**/ XR, X0, X1, /**/ X2 ); } #ifndef RELEASE PopCallStack(); #endif }
void benchmark_sort( const vex::Context &ctx, vex::profiler<> &prof ) { const size_t N = 16 * 1024 * 1024; const size_t M = 16; typedef typename std::conditional< std::is_same<float, real>::value, cl_uint, cl_ulong >::type key_type; std::default_random_engine rng( std::rand() ); std::uniform_int_distribution<key_type> rnd; std::vector<key_type> x0(N); std::vector<key_type> x1(N); std::generate(x0.begin(), x0.end(), [&]() { return rnd(rng); }); vex::vector<key_type> X0(ctx, x0); vex::vector<key_type> X1(ctx, N); X1 = X0; vex::sort(X1); double tot_time = 0; for(size_t i = 0; i < M; i++) { X1 = X0; ctx.finish(); prof.tic_cpu("VexCL"); vex::sort(X1); ctx.finish(); tot_time += prof.toc("VexCL"); } std::cout << "Sort (" << vex::type_name<key_type>() << ")\n" << " VexCL: " << N * M / tot_time << " keys/sec\n"; #ifdef HAVE_BOOST_COMPUTE X1 = X0; vex::compute::sort(X1); tot_time = 0; for(size_t i = 0; i < M; i++) { X1 = X0; ctx.finish(); prof.tic_cpu("Boost.Compute"); vex::compute::sort(X1); ctx.finish(); tot_time += prof.toc("Boost.Compute"); } std::cout << " Boost.Compute: " << N * M / tot_time << " keys/sec\n"; #endif #ifdef HAVE_CLOGS X1 = X0; vex::clogs::sort(X1); tot_time = 0; for(size_t i = 0; i < M; i++) { X1 = X0; ctx.finish(); prof.tic_cpu("CLOGS"); vex::clogs::sort(X1); ctx.finish(); tot_time += prof.toc("CLOGS"); } std::cout << " CLOGS: " << N * M / tot_time << " keys/sec\n"; #endif if (options.bm_cpu) { tot_time = 0; for(size_t i = 0; i < M; i++) { std::copy(x0.begin(), x0.end(), x1.begin()); prof.tic_cpu("STL"); std::sort(x1.begin(), x1.end()); tot_time += prof.toc("STL"); } std::cout << " STL: " << N * M / tot_time << " keys/sec\n"; } std::cout << std::endl; }
/* Compute W = U * V mod CTX->p.
 *
 * The only code that is compiled is the final call to mpi_mulm.  Everything
 * between "#if 0" and "#endif" is disabled: it splits the full product into
 * limb-wise rearranged terms s[0..], combines them by additions and
 * subtractions into ctx->c and finally brings ctx->c into the range of
 * ctx->p by repeated subtraction/addition of ctx->p.
 * NOTE(review): the disabled code presumably implements the NIST fast
 * reduction for the 192 and 384 bit curves -- confirm before re-enabling.  */
static void
ec_mulm (gcry_mpi_t w, gcry_mpi_t u, gcry_mpi_t v, mpi_ec_t ctx)
{
#if 0
  /* NOTE: This code works only for limb sizes of 32 bit.  */
  mpi_limb_t *wp, *sp;

  if (ctx->nist_nbits == 192)
    {
      /* Full product, padded to 12 limbs; terms are copied out pairwise.  */
      mpi_mul (w, u, v);
      mpi_resize (w, 12);
      wp = w->d;

      sp = ctx->s[0]->d;
      sp[0*2+0] = wp[0*2+0];
      sp[0*2+1] = wp[0*2+1];
      sp[1*2+0] = wp[1*2+0];
      sp[1*2+1] = wp[1*2+1];
      sp[2*2+0] = wp[2*2+0];
      sp[2*2+1] = wp[2*2+1];

      sp = ctx->s[1]->d;
      sp[0*2+0] = wp[3*2+0];
      sp[0*2+1] = wp[3*2+1];
      sp[1*2+0] = wp[3*2+0];
      sp[1*2+1] = wp[3*2+1];
      sp[2*2+0] = 0;
      sp[2*2+1] = 0;

      sp = ctx->s[2]->d;
      sp[0*2+0] = 0;
      sp[0*2+1] = 0;
      sp[1*2+0] = wp[4*2+0];
      sp[1*2+1] = wp[4*2+1];
      sp[2*2+0] = wp[4*2+0];
      sp[2*2+1] = wp[4*2+1];

      sp = ctx->s[3]->d;
      sp[0*2+0] = wp[5*2+0];
      sp[0*2+1] = wp[5*2+1];
      sp[1*2+0] = wp[5*2+0];
      sp[1*2+1] = wp[5*2+1];
      sp[2*2+0] = wp[5*2+0];
      sp[2*2+1] = wp[5*2+1];

      ctx->s[0]->nlimbs = 6;
      ctx->s[1]->nlimbs = 6;
      ctx->s[2]->nlimbs = 6;
      ctx->s[3]->nlimbs = 6;

      /* c = s0 + s1 + s2 + s3, then reduce below p.  */
      mpi_add (ctx->c, ctx->s[0], ctx->s[1]);
      mpi_add (ctx->c, ctx->c, ctx->s[2]);
      mpi_add (ctx->c, ctx->c, ctx->s[3]);

      while ( mpi_cmp (ctx->c, ctx->p ) >= 0 )
        mpi_sub ( ctx->c, ctx->c, ctx->p );
      mpi_set (w, ctx->c);
    }
  else if (ctx->nist_nbits == 384)
    {
      int i;
      mpi_mul (w, u, v);
      mpi_resize (w, 24);
      wp = w->d;

      /* NEXT(a) selects term s[a] (12 limbs) and resets the write index;
         X(a) appends limb A of the product; X0() appends a zero limb.  */
#define NEXT(a) do { ctx->s[(a)]->nlimbs = 12; \
                     sp = ctx->s[(a)]->d; \
                     i = 0; } while (0)
#define X(a) do { sp[i++] = wp[(a)];} while (0)
#define X0(a) do { sp[i++] = 0; } while (0)
      NEXT(0);
      X(0);X(1);X(2);X(3);X(4);X(5);X(6);X(7);X(8);X(9);X(10);X(11);
      NEXT(1);
      X0();X0();X0();X0();X(21);X(22);X(23);X0();X0();X0();X0();X0();
      NEXT(2);
      X(12);X(13);X(14);X(15);X(16);X(17);X(18);X(19);X(20);X(21);X(22);X(23);
      NEXT(3);
      X(21);X(22);X(23);X(12);X(13);X(14);X(15);X(16);X(17);X(18);X(19);X(20);
      NEXT(4);
      X0();X(23);X0();X(20);X(12);X(13);X(14);X(15);X(16);X(17);X(18);X(19);
      NEXT(5);
      X0();X0();X0();X0();X(20);X(21);X(22);X(23);X0();X0();X0();X0();
      NEXT(6);
      X(20);X0();X0();X(21);X(22);X(23);X0();X0();X0();X0();X0();X0();
      NEXT(7);
      X(23);X(12);X(13);X(14);X(15);X(16);X(17);X(18);X(19);X(20);X(21);X(22);
      NEXT(8);
      X0();X(20);X(21);X(22);X(23);X0();X0();X0();X0();X0();X0();X0();
      NEXT(9);
      X0();X0();X0();X(23);X(23);X0();X0();X0();X0();X0();X0();X0();
#undef X0
#undef X
#undef NEXT
      /* c = s0 + 2*s1 + s2 + s3 + s4 + s5 + s6 - s7 - s8 - s9.  */
      mpi_add (ctx->c, ctx->s[0], ctx->s[1]);
      mpi_add (ctx->c, ctx->c, ctx->s[1]);
      mpi_add (ctx->c, ctx->c, ctx->s[2]);
      mpi_add (ctx->c, ctx->c, ctx->s[3]);
      mpi_add (ctx->c, ctx->c, ctx->s[4]);
      mpi_add (ctx->c, ctx->c, ctx->s[5]);
      mpi_add (ctx->c, ctx->c, ctx->s[6]);
      mpi_sub (ctx->c, ctx->c, ctx->s[7]);
      mpi_sub (ctx->c, ctx->c, ctx->s[8]);
      mpi_sub (ctx->c, ctx->c, ctx->s[9]);

      /* Bring c into the range [0, p).  */
      while ( mpi_cmp (ctx->c, ctx->p ) >= 0 )
        mpi_sub ( ctx->c, ctx->c, ctx->p );
      while ( ctx->c->sign )
        mpi_add ( ctx->c, ctx->c, ctx->p );
      mpi_set (w, ctx->c);
    }
  else
#endif /*0*/
    /* Generic modular multiplication; the only path that is compiled.  */
    mpi_mulm (w, u, v, ctx->p);
}
inline void TrsmLLTLarge ( Orientation orientation, UnitOrNonUnit diag, F alpha, const DistMatrix<F>& L, DistMatrix<F>& X, bool checkIfSingular ) { #ifndef RELEASE PushCallStack("internal::TrsmLLTLarge"); if( orientation == NORMAL ) throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option"); #endif const Grid& g = L.Grid(); // Matrix views DistMatrix<F> LTL(g), LTR(g), L00(g), L01(g), L02(g), LBL(g), LBR(g), L10(g), L11(g), L12(g), L20(g), L21(g), L22(g); DistMatrix<F> XT(g), X0(g), XB(g), X1(g), X2(g); // Temporary distributions DistMatrix<F,STAR,MC > L10_STAR_MC(g); DistMatrix<F,STAR,STAR> L11_STAR_STAR(g); DistMatrix<F,STAR,MR > X1_STAR_MR(g); DistMatrix<F,STAR,VR > X1_STAR_VR(g); // Start the algorithm Scale( alpha, X ); LockedPartitionUpDiagonal ( L, LTL, LTR, LBL, LBR, 0 ); PartitionUp ( X, XT, XB, 0 ); while( XT.Height() > 0 ) { LockedRepartitionUpDiagonal ( LTL, /**/ LTR, L00, L01, /**/ L02, /**/ L10, L11, /**/ L12, /*************/ /******************/ LBL, /**/ LBR, L20, L21, /**/ L22 ); RepartitionUp ( XT, X0, X1, /**/ /**/ XB, X2 ); L10_STAR_MC.AlignWith( X0 ); X1_STAR_MR.AlignWith( X0 ); //--------------------------------------------------------------------// L11_STAR_STAR = L11; // L11[* ,* ] <- L11[MC,MR] X1_STAR_VR = X1; // X1[* ,VR] <- X1[MC,MR] // X1[* ,VR] := L11^-[T/H][* ,* ] X1[* ,VR] LocalTrsm ( LEFT, LOWER, orientation, diag, F(1), L11_STAR_STAR, X1_STAR_VR, checkIfSingular ); X1_STAR_MR = X1_STAR_VR; // X1[* ,MR] <- X1[* ,VR] X1 = X1_STAR_MR; // X1[MC,MR] <- X1[* ,MR] L10_STAR_MC = L10; // L10[* ,MC] <- L10[MC,MR] // X0[MC,MR] -= (L10[* ,MC])^(T/H) X1[* ,MR] // = L10^[T/H][MC,* ] X1[* ,MR] LocalGemm ( orientation, NORMAL, F(-1), L10_STAR_MC, X1_STAR_MR, F(1), X0 ); //--------------------------------------------------------------------// L10_STAR_MC.FreeAlignments(); X1_STAR_MR.FreeAlignments(); SlideLockedPartitionUpDiagonal ( LTL, /**/ LTR, L00, /**/ L01, L02, /*************/ /******************/ /**/ L10, /**/ L11, L12, LBL, 
/**/ LBR, L20, /**/ L21, L22 ); SlidePartitionUp ( XT, X0, /**/ /**/ X1, XB, X2 ); } #ifndef RELEASE PopCallStack(); #endif }
void OpticalFlow::baseCalculate(cv::Mat& Im1, cv::Mat& Im2, flowUV& UV, const OpticalFlowParams& params){ int rows = Im1.rows; int cols = Im1.cols; FlowOperator flowOp(rows, cols); FArray X0(2 * rows * cols, false); FArray dUdV(2 * rows * cols, true, 0); cv::Mat Ix1(rows, cols, OPTFLOW_TYPE); cv::Mat Iy1(rows, cols, OPTFLOW_TYPE); cv::Mat Ix(rows, cols, OPTFLOW_TYPE); cv::Mat Iy(rows, cols, OPTFLOW_TYPE); getDXsCV(Im1, Ix1, Iy1); for (int i = 0; i < params.getIters(); ++i){ cv::Mat Ix2(rows, cols, OPTFLOW_TYPE); cv::Mat Iy2(rows, cols, OPTFLOW_TYPE); cv::Mat It(rows, cols, OPTFLOW_TYPE); cv::Mat im2Warpped(rows, cols, Im1.type()); WarpImage(Im2, UV.getU(), UV.getV(), im2Warpped); getDXsCV(im2Warpped, Ix2, Iy2); Ix = params.getWeightedDeriveFactor() * (Ix1 + Ix2); Iy = params.getWeightedDeriveFactor() * (Iy1 + Iy2); cv::subtract(im2Warpped, Im1, It); if (params.getDisplayDerivativs()){ cv::imshow("Derivative Ix", Ix); cv::imshow("Derivative Iy", Iy); cv::waitKey(1); } cv::Mat Du(rows, cols, OPTFLOW_TYPE, cv::Scalar(0)); cv::Mat Dv(rows, cols, OPTFLOW_TYPE, cv::Scalar(0)); for (int j = 0; j < params.getLinearIters(); ++j){ #if OPTFLOW_VERBOSE cout << "solving Ax=b with SOR "; clock_t start = std::clock(); #endif flowOp.construct(UV, Du, Dv, Ix, Iy, It, params); memcpy(X0.ptr, UV.getU().data, rows * cols * sizeof(float)); memcpy(X0.ptr + (rows * cols), UV.getV().data, rows * cols * sizeof(float)); //UtilsDebug::printCRSSparseMat(flowOp.getA(), "aaaa.txt"); if (params.getCheckResidualTolerance()){ LinearSolver::sparseMatSor(flowOp.getA(), X0 ,dUdV, flowOp.getb(), params.getOverRelaxation(), params.getSorIters(), params.getResidualTolerance()); }else{ //LinearSolver::multigrid(10,10,flowOp.getA(),flowOp.getb(), params.getResidualTolerance(), dUdV, 20, 20, LinearSolver::vCycle); LinearSolver::sparseMatSorNoResidual(flowOp.getA(), X0 ,dUdV, flowOp.getb(), params.getOverRelaxation(), params.getSorIters()); } #if OPTFLOW_VERBOSE std::cout<<" --- "<< (std::clock() - start) / 
(double)CLOCKS_PER_SEC <<'\n'; #endif #if OPTFLOW_DEBUG for(int i = 0; i < dUdV.size(); ++i){ if (!(dUdV.ptr[i] == dUdV.ptr[i])){ cout << "ERROR - NAN"; } } #endif UtilsMat::clamp(dUdV, -1, 1); memcpy(Du.data, dUdV.ptr, rows * cols * sizeof(float)); memcpy(Dv.data, dUdV.ptr + (rows * cols), rows * cols * sizeof(float)); flowUV UV0(UV); UV.getU() += Du; UV.getV() += Dv; cv::Mat tmpU, tmpV; UV.getU().copyTo(tmpU); UV.getV().copyTo(tmpV); WeightedMedianFilter::computeMedianFilter(UV.getU(), UV.getV(), Im1, Im2, params.getMedianFilterRadius()); Du = UV.getU() - UV0.getU(); Dv = UV.getV() - UV0.getV(); UV0.getU().copyTo(UV.getU()); UV0.getV().copyTo(UV.getV()); UV0.getU() += Du; UV0.getV() += Dv; if (params.isDisplay()) UtilsFlow::DrawFlow(UV0.getU(), UV0.getV(), "Flow"); } UV.getU() += Du; UV.getV() += Dv; } }
/* f0_val: look up the id that X0 yields for `base` and return id2val of it. */
static int f0_val(int base)
{
    int ident;

    ident = X0(base);
    return id2val(ident);
} /* f0_val */
// Blocked, distributed triangular-solve kernel.
//
// Judging by the name and by the LocalTrsm arguments (RIGHT, UPPER,
// orientation), this presumably overwrites X with alpha * X * U^{-T}
// (U^{-H} when orientation == ADJOINT) -- confirm against the public Trsm
// dispatcher.
//
//   orientation     : must not be NORMAL (only checked when RELEASE is not
//                     defined)
//   diag            : forwarded to LocalTrsm
//   alpha           : X is scaled by alpha before the sweep starts
//   U               : triangular operand, read-only
//   X               : right-hand sides, overwritten in place
//   checkIfSingular : forwarded to LocalTrsm
//
// NOTE(review): F is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrsmRUT
( Orientation orientation, UnitOrNonUnit diag,
  F alpha, const DistMatrix<F>& U, DistMatrix<F>& X,
  bool checkIfSingular )
{
#ifndef RELEASE
    PushCallStack("internal::TrsmRUT");
    if( orientation == NORMAL )
        throw std::logic_error("TrsmRUT expects a (Conjugate)Transpose option");
#endif
    const Grid& g = U.Grid();

    // Matrix views
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    DistMatrix<F> XL(g), XR(g),
                  X0(g), X1(g), X2(g);

    // Temporary distributions
    DistMatrix<F,VR,  STAR> U01_VR_STAR(g);
    DistMatrix<F,STAR,MR  > U01AdjOrTrans_STAR_MR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,VC,  STAR> X1_VC_STAR(g);
    DistMatrix<F,STAR,MC  > X1Trans_STAR_MC(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    PartitionLeft( X, XL, XR, 0 );
    // Each pass exposes one diagonal block U11 and the matching panel X1;
    // the loop ends once XL has been consumed (zero width).
    while( XL.Width() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( UTL, /**/ UTR,   U00, U01, /**/ U02,
               /**/        U10, U11, /**/ U12,
         /*************/  /******************/
          UBL, /**/ UBR,   U20, U21, /**/ U22 );

        RepartitionLeft
        ( XL,     /**/ XR,
          X0, X1, /**/ X2 );

        // Align every temporary with X0, the operand of the local update
        // at the end of this pass.
        X1_VC_STAR.AlignWith( X0 );
        X1Trans_STAR_MC.AlignWith( X0 );
        U01_VR_STAR.AlignWith( X0 );
        U01AdjOrTrans_STAR_MR.AlignWith( X0 );
        //--------------------------------------------------------------------//
        // Solve against the diagonal block, working on redistributed copies
        // of U11 and X1.
        U11_STAR_STAR = U11;
        X1_VC_STAR = X1;

        LocalTrsm
        ( RIGHT, UPPER, orientation, diag,
          F(1), U11_STAR_STAR, X1_VC_STAR, checkIfSingular );

        // Write the solved panel back into X1 by way of its [STAR,MC]
        // transpose, which is also the left operand of the update below.
        X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR );
        X1.TransposeFrom( X1Trans_STAR_MC );
        U01_VR_STAR = U01;
        if( orientation == ADJOINT )
            U01AdjOrTrans_STAR_MR.AdjointFrom( U01_VR_STAR );
        else
            U01AdjOrTrans_STAR_MR.TransposeFrom( U01_VR_STAR );

        // X0[MC,MR] -= X1[MC,* ] (U01[MR,* ])^(T/H)
        //            = X1^T[* ,MC] (U01^(T/H))[* ,MR]
        LocalGemm
        ( TRANSPOSE, NORMAL,
          F(-1), X1Trans_STAR_MC, U01AdjOrTrans_STAR_MR, F(1), X0 );
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to the new X0.
        X1_VC_STAR.FreeAlignments();
        X1Trans_STAR_MC.FreeAlignments();
        U01_VR_STAR.FreeAlignments();
        U01AdjOrTrans_STAR_MR.FreeAlignments();

        SlideLockedPartitionUpDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        SlidePartitionLeft
        ( XL, /**/     XR,
          X0, /**/ X1, X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Blocked, distributed triangular matrix-multiply kernel.
//
// Judging by the name and by the LocalTrmm arguments (LEFT, LOWER, NORMAL),
// this presumably overwrites X with alpha * L * X -- confirm against the
// public Trmm dispatcher.
//
//   diag  : forwarded to LocalTrmm
//   alpha : X is scaled by alpha before the sweep starts
//   L     : square triangular operand, read-only
//   X     : overwritten in place; X.Height() must equal L.Width()
//
// When RELEASE is not defined, a grid mismatch or nonconformal sizes raise
// std::logic_error.
//
// NOTE(review): T is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrmmLLNC
( UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& L,
                 DistMatrix<T>& X )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TrmmLLNC");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( L.Height() != L.Width() || L.Width() != X.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmLLNC: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<T>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);
    DistMatrix<T> XT(g),  X0(g),
                  XB(g),  X1(g),
                          X2(g);

    // Temporary distributions
    DistMatrix<T,MC,  STAR> L21_MC_STAR(g);
    DistMatrix<T,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<T,STAR,VR  > X1_STAR_VR(g);
    DistMatrix<T,MR,  STAR> X1Trans_MR_STAR(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionUp
    ( X, XT,
         XB, 0 );
    // Each pass exposes one diagonal block L11 and the matching panel X1;
    // the loop ends once XT has been consumed (zero height).
    while( XT.Height() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( LTL, /**/ LTR,   L00, L01, /**/ L02,
               /**/        L10, L11, /**/ L12,
         /*************/  /******************/
          LBL, /**/ LBR,   L20, L21, /**/ L22 );

        RepartitionUp
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );

        L21_MC_STAR.AlignWith( X2 );
        X1Trans_MR_STAR.AlignWith( X2 );
        X1_STAR_VR.AlignWith( X1 );
        //--------------------------------------------------------------------//
        // X2 += L21 X1. This uses X1 before the LocalTrmm below overwrites it.
        L21_MC_STAR = L21;
        X1Trans_MR_STAR.TransposeFrom( X1 );
        LocalGemm
        ( NORMAL, TRANSPOSE,
          T(1), L21_MC_STAR, X1Trans_MR_STAR, T(1), X2 );

        // X1 := L11 X1, computed on redistributed copies and written back.
        L11_STAR_STAR = L11;
        X1_STAR_VR.TransposeFrom( X1Trans_MR_STAR );
        LocalTrmm( LEFT, LOWER, NORMAL, diag, T(1), L11_STAR_STAR, X1_STAR_VR );
        X1 = X1_STAR_VR;
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to new views.
        L21_MC_STAR.FreeAlignments();
        X1Trans_MR_STAR.FreeAlignments();
        X1_STAR_VR.FreeAlignments();

        SlideLockedPartitionUpDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        SlidePartitionUp
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );
    }
}
// Blocked, distributed triangular matrix-multiply kernel.
//
// Judging by the name and by the LocalTrmm arguments (RIGHT, UPPER, NORMAL),
// this presumably overwrites X with alpha * X * U -- confirm against the
// public Trmm dispatcher.
//
//   diag  : forwarded to LocalTrmm
//   alpha : X is scaled by alpha before the sweep starts
//   U     : square triangular operand, read-only
//   X     : overwritten in place; X.Width() must equal U.Height()
//
// When RELEASE is not defined, a grid mismatch or nonconformal sizes raise
// std::logic_error.
//
// NOTE(review): T is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrmmRUNC
( UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& U,
                 DistMatrix<T>& X )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TrmmRUNC");
    if( U.Grid() != X.Grid() )
        throw std::logic_error
        ("U and X must be distributed over the same grid");
    if( U.Height() != U.Width() || X.Width() != U.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmRUNC: \n"
            << " U ~ " << U.Height() << " x " << U.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = U.Grid();

    // Matrix views
    DistMatrix<T>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    DistMatrix<T> XL(g), XR(g),
                  X0(g), X1(g), X2(g);

    // Temporary distributions
    DistMatrix<T,MR,  STAR> U12Trans_MR_STAR(g);
    DistMatrix<T,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<T,VC,  STAR> X1_VC_STAR(g);
    DistMatrix<T,MC,  STAR> X1_MC_STAR(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    PartitionLeft( X, XL, XR, 0 );
    // Each pass exposes one diagonal block U11 and the matching panel X1;
    // the loop ends once XL has been consumed (zero width).
    while( XL.Width() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( UTL, /**/ UTR,   U00, U01, /**/ U02,
               /**/        U10, U11, /**/ U12,
         /*************/  /******************/
          UBL, /**/ UBR,   U20, U21, /**/ U22 );

        RepartitionLeft
        ( XL,     /**/ XR,
          X0, X1, /**/ X2 );

        X1_MC_STAR.AlignWith( X2 );
        U12Trans_MR_STAR.AlignWith( X2 );
        X1_VC_STAR.AlignWith( X1 );
        //--------------------------------------------------------------------//
        // X2 += X1 U12. This uses X1 before the LocalTrmm below overwrites it.
        X1_MC_STAR = X1;
        U12Trans_MR_STAR.TransposeFrom( U12 );
        LocalGemm
        ( NORMAL, TRANSPOSE,
          T(1), X1_MC_STAR, U12Trans_MR_STAR, T(1), X2 );

        // X1 := X1 U11, computed on redistributed copies and written back.
        U11_STAR_STAR = U11;
        X1_VC_STAR = X1_MC_STAR;
        LocalTrmm
        ( RIGHT, UPPER, NORMAL, diag, T(1), U11_STAR_STAR, X1_VC_STAR );
        X1 = X1_VC_STAR;
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to new views.
        X1_MC_STAR.FreeAlignments();
        U12Trans_MR_STAR.FreeAlignments();
        X1_VC_STAR.FreeAlignments();

        SlideLockedPartitionUpDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        SlidePartitionLeft
        ( XL, /**/     XR,
          X0, /**/ X1, X2 );
    }
}
// Blocked triangular-solve kernel for operands that are already stored as
// L[STAR,VR] and X[VR,STAR].
//
// Judging by the name and by the LocalTrsm arguments (LEFT, LOWER,
// orientation), this presumably overwrites X with alpha * L^{-T} * X
// (L^{-H} when orientation == ADJOINT) -- confirm against the public Trsm
// dispatcher.
//
//   orientation     : must not be NORMAL
//   diag            : forwarded to LocalTrsm
//   alpha           : X is scaled by alpha before the sweep starts
//   L               : square triangular operand, read-only
//   X               : right-hand sides, overwritten in place
//   checkIfSingular : forwarded to LocalTrsm
//
// When RELEASE is not defined, the following raise std::logic_error: a grid
// mismatch, orientation == NORMAL, nonconformal sizes, or
// L.RowAlignment() != X.ColAlignment(). (The messages say "TrsmLLT", not
// "TrsmLLTSmall".)
//
// NOTE(review): F is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrsmLLTSmall
( Orientation orientation, UnitOrNonUnit diag,
  F alpha, const DistMatrix<F,STAR,VR>& L, DistMatrix<F,VR,STAR>& X,
  bool checkIfSingular )
{
#ifndef RELEASE
    PushCallStack("internal::TrsmLLTSmall");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( orientation == NORMAL )
        throw std::logic_error("TrsmLLT expects a (Conjugate)Transpose option");
    if( L.Height() != L.Width() || L.Height() != X.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrsmLLT: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
    if( L.RowAlignment() != X.ColAlignment() )
        throw std::logic_error("L and X must be aligned");
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<F,STAR,VR>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    DistMatrix<F,VR,STAR> XT(g),  X0(g),
                          XB(g),  X1(g),
                                  X2(g);

    // Temporary distributions
    DistMatrix<F,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<F,STAR,STAR> X1_STAR_STAR(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionUp
    ( X, XT,
         XB, 0 );
    // Each pass exposes one diagonal block L11 and the matching panel X1;
    // the loop ends once XT has been consumed (zero height).
    while( XT.Height() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( LTL, /**/ LTR,   L00, L01, /**/ L02,
               /**/        L10, L11, /**/ L12,
         /*************/  /******************/
          LBL, /**/ LBR,   L20, L21, /**/ L22 );

        RepartitionUp
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );

        //--------------------------------------------------------------------//
        L11_STAR_STAR = L11; // L11[* ,* ] <- L11[* ,VR]
        X1_STAR_STAR = X1;   // X1[* ,* ] <- X1[VR,* ]

        // X1[* ,* ] := L11^-[T/H][* ,* ] X1[* ,* ]
        LocalTrsm
        ( LEFT, LOWER, orientation, diag,
          F(1), L11_STAR_STAR, X1_STAR_STAR, checkIfSingular );
        // Store the solved panel back into the distributed X1.
        X1 = X1_STAR_STAR;

        // X0[VR,* ] -= L10[* ,VR]^(T/H) X1[* ,* ]
        // L10 and X0 are passed to LocalGemm as they are, without a
        // redistribution step.
        LocalGemm( orientation, NORMAL, F(-1), L10, X1_STAR_STAR, F(1), X0 );
        //--------------------------------------------------------------------//

        SlideLockedPartitionUpDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        SlidePartitionUp
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Blocked, distributed triangular matrix-multiply kernel (the "Old" variant
// of TrmmRLNC, going by its name).
//
// Judging by the name and by the LocalTrmm arguments (RIGHT, LOWER, NORMAL),
// this presumably overwrites X with alpha * X * L -- confirm against the
// public Trmm dispatcher.
//
//   diag  : forwarded to LocalTrmm
//   alpha : X is scaled by alpha before the sweep starts
//   L     : square triangular operand, read-only
//   X     : overwritten in place; X.Width() must equal L.Height()
//
// When RELEASE is not defined, a grid mismatch or nonconformal sizes raise
// std::logic_error. (The size message says "TrmmRLNC".)
//
// NOTE(review): T is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrmmRLNCOld
( UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& L,
                 DistMatrix<T>& X )
{
#ifndef RELEASE
    PushCallStack("internal::TrmmRLNCOld");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( L.Height() != L.Width() || X.Width() != L.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmRLNC: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<T>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    DistMatrix<T> XL(g), XR(g),
                  X0(g), X1(g), X2(g);

    // Temporary distributions
    DistMatrix<T,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<T,MR,  STAR> L21_MR_STAR(g);
    DistMatrix<T,VC,  STAR> X1_VC_STAR(g);
    DistMatrix<T,MC,  STAR> D1_MC_STAR(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionRight( X, XL, XR, 0 );
    // Each pass exposes one diagonal block L11 and the matching panel X1;
    // the loop ends once XR has been consumed (zero width).
    while( XR.Width() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        RepartitionRight
        ( XL, /**/     XR,
          X0, /**/ X1, X2 );

        L21_MR_STAR.AlignWith( X2 );
        D1_MC_STAR.AlignWith( X1 );
        // D1 receives the local products X2 L21 below; size it like X1.
        Zeros( X1.Height(), X1.Width(), D1_MC_STAR );
        //--------------------------------------------------------------------//
        // X1 := X1 L11, computed on redistributed copies and written back.
        X1_VC_STAR = X1;
        L11_STAR_STAR = L11;
        LocalTrmm
        ( RIGHT, LOWER, NORMAL, diag, T(1), L11_STAR_STAR, X1_VC_STAR );
        X1 = X1_VC_STAR;

        // X1 += X2 L21: form the local contributions in D1, then add them
        // into X1 with SumScatterUpdate.
        L21_MR_STAR = L21;
        LocalGemm( NORMAL, NORMAL, T(1), X2, L21_MR_STAR, T(0), D1_MC_STAR );
        X1.SumScatterUpdate( T(1), D1_MC_STAR );
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to new views.
        L21_MR_STAR.FreeAlignments();
        D1_MC_STAR.FreeAlignments();

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,   L00, L01, /**/ L02,
               /**/        L10, L11, /**/ L12,
         /*************/  /******************/
          LBL, /**/ LBR,   L20, L21, /**/ L22 );

        SlidePartitionRight
        ( XL,     /**/ XR,
          X0, X1, /**/ X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
//! Paints the stereogram
/** Drawing order: base QLabel content, optional HSV ring, outer circle and
	main axes, angular grid (circles + rays/ticks), density map, mean dip
	direction and finally the filter window around the last clicked point.
	m_radius and m_center are updated with the position of the outer circle
	(m_radius stays null until that circle has been drawn).
**/
void StereogramWidget::paintEvent(QPaintEvent* event)
{
	//a null radius means that nothing has been drawn (yet)
	m_radius = 0;

	QLabel::paintEvent(event);

	QPainter painter(this);
	painter.setRenderHints(QPainter::Antialiasing | QPainter::TextAntialiasing, true);

	//default pen: solid black
	QPen pen;
	pen.setStyle(Qt::SolidLine);
	pen.setBrush(QColor(Qt::black));

	int diameter = std::min(width(),height());
	const QPoint center(width()/2,height()/2);

	//optional HSV ring: it takes the outer 10% of the available diameter
	if (m_showHSVRing)
	{
		const int innerDiameter = static_cast<int>(ceil(0.9*static_cast<double>(diameter)));
		const int ringThickness = diameter - innerDiameter;

		//TODO
		if (ringThickness > 0)
		{
			const QRect ringRect(center.x()-diameter/2+1,center.y()-diameter/2+1,diameter-2,diameter-2);
			//angles are multiplied by 16 (see QPainter::drawPie)
			const int sectorSpan = static_cast<int>(m_angularStep_deg * 16.0);

			QBrush sectorBrush;
			sectorBrush.setStyle(Qt::SolidPattern);
			painter.setPen(Qt::NoPen);

			//one pie sector per dip direction step (dip dir. in [0,360])
			const unsigned sectorCount = static_cast<unsigned>(ceil(360.0 / std::max(m_angularStep_deg,1.0)));
			for (unsigned s=0; s<sectorCount; ++s)
			{
				const double sectorDipDir_deg = static_cast<double>(s) * m_angularStep_deg;

				//the sector color is the family color at the middle of the sector
				ccColor::Rgb col;
				FacetsClassifier::GenerateSubfamilyColor(col, 90.0, sectorDipDir_deg + 0.5 * m_angularStep_deg, 0, 1);
				sectorBrush.setColor(QColor(	static_cast<int>(col.r),
												static_cast<int>(col.g),
												static_cast<int>(col.b),
												255));
				painter.setBrush(sectorBrush);

				const int sectorStart = static_cast<int>((360.0 - sectorDipDir_deg - m_angularStep_deg + 90.0) * 16.0); //see QPainter::drawPie
				painter.drawPie(ringRect,sectorStart,sectorSpan);
			}
		}

		//the stereogram itself is drawn inside the ring
		diameter = innerDiameter;
	}

	//outer circle (white disc with a black outline)
	pen.setWidth(2);
	painter.setPen(pen);
	painter.setBrush(Qt::white);
	const int radius = diameter/2 - 2;
	painter.drawEllipse(center,radius,radius);
	painter.setBrush(Qt::NoBrush);

	//remember where the circle has been drawn
	m_radius = radius;
	m_center = center;

	//main axes (horizontal then vertical)
	painter.drawLine(center-QPoint(radius,0),center+QPoint(radius,0));
	painter.drawLine(center-QPoint(0,radius),center+QPoint(0,radius));

	//angular grid
	if (m_angularStep_deg > 0)
	{
		//number of dip steps (dip in [0,90])
		const unsigned dSteps = static_cast<unsigned>(ceil(90.0 / m_angularStep_deg));
		//number of dip direction steps (dip dir. in [0,360])
		const unsigned ddSteps = static_cast<unsigned>(ceil(360.0 / m_angularStep_deg));

		pen.setWidth(1);
		pen.setColor(Qt::gray);
		painter.setPen(pen);

		//inner circles (one per dip step strictly below 90 degrees)
		for (unsigned i=1; i<dSteps; ++i)
		{
			const double dip_deg = static_cast<double>(i) * m_angularStep_deg;
			if (!(dip_deg < 90.0))
				continue;

			const int circleRadius = static_cast<int>(static_cast<double>(radius) * (dip_deg/90.0));
			if (circleRadius > 1)
				painter.drawEllipse(center,circleRadius-1,circleRadius-1);
		}

		//rays, plus 'ticksFreq' times more short ticks near the border
		const int ticksFreq = std::max(m_ticksFreq,1);
		const unsigned tickCount = ddSteps*ticksFreq;
		for (unsigned j=1; j<=tickCount; ++j)
		{
			const double dipDir_deg = static_cast<double>(j) * m_angularStep_deg / static_cast<double>(ticksFreq);
			if (!(dipDir_deg < 360.0))
				continue;

			const QPoint tip(	 static_cast<int>(sin(dipDir_deg * CC_DEG_TO_RAD) * static_cast<double>(radius)),
								-static_cast<int>(cos(dipDir_deg * CC_DEG_TO_RAD) * static_cast<double>(radius)) );

			//full rays start at the center, intermediate ticks at 93% of the radius
			const QPoint tickStart = ((j % ticksFreq) == 0) ? center : center+tip*0.93;
			painter.drawLine(tickStart,center+tip);
		}
	}

	//density map (one filled quad per non-empty grid cell)
	if (	m_densityGrid
		&&	m_densityColorScale
		&&	m_densityGrid->grid
		&&	m_densityGrid->minMaxDensity[1] != 0)
	{
		assert(m_densityColorScale);
		assert(m_densityGrid->grid);

		QBrush cellBrush;
		cellBrush.setStyle(Qt::SolidPattern);
		painter.setPen(Qt::NoPen);

		QPolygon cell(4);

		//the grid is read sequentially: all radial cells of a dip direction, then the next direction
		const double* density = m_densityGrid->grid;
		for (unsigned j=0; j<m_densityGrid->ddSteps; ++j)
		{
			const double dipDir0_rad = static_cast<double>(j)   * m_densityGrid->step_deg * CC_DEG_TO_RAD;
			const double dipDir1_rad = static_cast<double>(j+1) * m_densityGrid->step_deg * CC_DEG_TO_RAD;
			const double cos_dipDir0 = cos(dipDir0_rad);
			const double sin_dipDir0 = sin(dipDir0_rad);
			const double cos_dipDir1 = cos(dipDir1_rad);
			const double sin_dipDir1 = sin(dipDir1_rad);

			for (unsigned i=0; i<m_densityGrid->rSteps; ++i, ++density)
			{
				const double cellDensity = *density;
				if (cellDensity == 0)
					continue;

				//cell color = density relative to the max density
				const double relPos = static_cast<double>(cellDensity)/static_cast<double>(m_densityGrid->minMaxDensity[1]);
				const colorType* col = m_densityColorScale->getColorByRelativePos(relPos,m_densityColorScaleSteps);
				cellBrush.setColor(QColor(	static_cast<int>(col[0]),
											static_cast<int>(col[1]),
											static_cast<int>(col[2]),
											255));
				painter.setBrush(cellBrush);

				//radial limits of the cell (linear in 'step_R')
				//(disabled alternative, stereographic projection:
				// dip = i * step_deg * CC_DEG_TO_RAD ; R = radius * cos(dip) / (1.0 + sin(dip)))
				const double innerR = static_cast<double>(radius) * static_cast<double>(i)   * m_densityGrid->step_R;
				const double outerR = static_cast<double>(radius) * static_cast<double>(i+1) * m_densityGrid->step_R;

				cell.setPoint(0,center+QPoint(static_cast<int>(sin_dipDir0 * innerR),-static_cast<int>(cos_dipDir0 * innerR)));
				cell.setPoint(1,center+QPoint(static_cast<int>(sin_dipDir0 * outerR),-static_cast<int>(cos_dipDir0 * outerR)));
				cell.setPoint(2,center+QPoint(static_cast<int>(sin_dipDir1 * outerR),-static_cast<int>(cos_dipDir1 * outerR)));
				cell.setPoint(3,center+QPoint(static_cast<int>(sin_dipDir1 * innerR),-static_cast<int>(cos_dipDir1 * innerR)));

				painter.drawPolygon(cell);
			}
		}
	}

	//mean dip direction (negative value = not defined)
	if (m_meanDipDir_deg >= 0)
	{
		pen.setWidth(2);
		pen.setColor(Qt::red);
		painter.setPen(pen);

		//integer offsets along the mean direction
		const int sinOffset = static_cast<int>(sin(m_meanDipDir_deg * CC_DEG_TO_RAD) * static_cast<double>(radius));
		const int cosOffset = static_cast<int>(cos(m_meanDipDir_deg * CC_DEG_TO_RAD) * static_cast<double>(radius));

		//the direction itself: dashed line from the center
		const QPoint alongDir(sinOffset,-cosOffset);
		pen.setStyle(Qt::DashLine);
		painter.setPen(pen);
		painter.drawLine(center,center+alongDir);

		//its orthogonal: solid line across the whole circle
		const QPoint acrossDir(cosOffset,sinOffset);
		pen.setStyle(Qt::SolidLine);
		painter.setPen(pen);
		painter.drawLine(center-acrossDir,center+acrossDir);
	}

	//filter window around the last clicked point
	if (m_trackMouseClick)
	{
		pen.setWidth(2);
		pen.setColor(Qt::magenta);
		painter.setPen(pen);
		painter.setBrush(Qt::NoBrush);

		//radial extent of the window (dip clamped to [0,90])
		const double innerR = static_cast<double>(radius) * (std::max(0.0,m_clickDip_deg-m_clickDipSpan_deg/2)/90.0);
		const double outerR = static_cast<double>(radius) * (std::min(90.0,m_clickDip_deg+m_clickDipSpan_deg/2)/90.0);

		//radial limits (first edge: dip direction minus half the span)
		{
			const double edge_rad = (m_clickDipDir_deg-m_clickDipDirSpan_deg/2) * CC_DEG_TO_RAD;
			const QPoint edgeStart(	 static_cast<int>(sin(edge_rad) * innerR),
									-static_cast<int>(cos(edge_rad) * innerR) );
			const QPoint edgeEnd(	 static_cast<int>(sin(edge_rad) * outerR),
									-static_cast<int>(cos(edge_rad) * outerR) );
			painter.drawLine(center+edgeStart,center+edgeEnd);
		}
		//radial limits (second edge: dip direction plus half the span)
		{
			const double edge_rad = (m_clickDipDir_deg+m_clickDipDirSpan_deg/2) * CC_DEG_TO_RAD;
			const QPoint edgeStart(	 static_cast<int>(sin(edge_rad) * innerR),
									-static_cast<int>(cos(edge_rad) * innerR) );
			const QPoint edgeEnd(	 static_cast<int>(sin(edge_rad) * outerR),
									-static_cast<int>(cos(edge_rad) * outerR) );
			painter.drawLine(center+edgeStart,center+edgeEnd);
		}

		//concentric limits (inner arc then outer arc)
		{
			const int arcStart = static_cast<int>((360.0 - m_clickDipDir_deg - m_clickDipDirSpan_deg/2 + 90.0) * 16.0); //see QPainter::drawPie
			const int arcSpan  = static_cast<int>(m_clickDipDirSpan_deg * 16.0); //see QPainter::drawPie

			const QRectF innerRect(	static_cast<double>(center.x()) - innerR,
									static_cast<double>(center.y()) - innerR,
									2*innerR, 2*innerR);
			painter.drawArc(innerRect,arcStart,arcSpan);

			const QRectF outerRect(	static_cast<double>(center.x()) - outerR,
									static_cast<double>(center.y()) - outerR,
									2*outerR, 2*outerR);
			painter.drawArc(outerRect,arcStart,arcSpan);
		}
	}
}
// Blocked, distributed triangular-solve kernel.
//
// Judging by the name and by the LocalTrsm arguments (RIGHT, UPPER, NORMAL),
// this presumably overwrites X with alpha * X * U^{-1} -- confirm against
// the public Trsm dispatcher.
//
//   diag            : forwarded to LocalTrsm
//   alpha           : X is scaled by alpha before the sweep starts
//   U               : triangular operand, read-only
//   X               : right-hand sides, overwritten in place
//   checkIfSingular : forwarded to LocalTrsm
//
// No grid or size checks are performed here.
//
// NOTE(review): F is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrsmRUN
( UnitOrNonUnit diag,
  F alpha, const DistMatrix<F>& U, DistMatrix<F>& X,
  bool checkIfSingular )
{
#ifndef RELEASE
    PushCallStack("internal::TrsmRUN");
#endif
    const Grid& g = U.Grid();

    // Matrix views
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    DistMatrix<F> XL(g), XR(g),
                  X0(g), X1(g), X2(g);

    // Temporary distributions
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,STAR,MR  > U12_STAR_MR(g);
    DistMatrix<F,VC,  STAR> X1_VC_STAR(g);
    DistMatrix<F,STAR,MC  > X1Trans_STAR_MC(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionDownDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    PartitionRight( X, XL, XR, 0 );
    // Each pass exposes one diagonal block U11 and the matching panel X1;
    // the loop ends once XR has been consumed (zero width).
    while( XR.Width() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        RepartitionRight
        ( XL, /**/     XR,
          X0, /**/ X1, X2 );

        // Align every temporary with X2, the operand of the local update
        // at the end of this pass.
        X1_VC_STAR.AlignWith( X2 );
        X1Trans_STAR_MC.AlignWith( X2 );
        U12_STAR_MR.AlignWith( X2 );
        //--------------------------------------------------------------------//
        // Solve against the diagonal block, working on redistributed copies
        // of U11 and X1.
        U11_STAR_STAR = U11;
        X1_VC_STAR = X1;

        LocalTrsm
        ( RIGHT, UPPER, NORMAL, diag,
          F(1), U11_STAR_STAR, X1_VC_STAR, checkIfSingular );

        // Write the solved panel back into X1 by way of its [STAR,MC]
        // transpose, which is also the left operand of the update below.
        X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR );
        X1.TransposeFrom( X1Trans_STAR_MC );
        U12_STAR_MR = U12;

        // X2[MC,MR] -= X1[MC,* ] U12[* ,MR]
        //            = X1^T[* ,MC] U12[* ,MR]
        LocalGemm
        ( TRANSPOSE, NORMAL,
          F(-1), X1Trans_STAR_MC, U12_STAR_MR, F(1), X2 );
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to the new X2.
        X1_VC_STAR.FreeAlignments();
        X1Trans_STAR_MC.FreeAlignments();
        U12_STAR_MR.FreeAlignments();

        SlideLockedPartitionDownDiagonal
        ( UTL, /**/ UTR,   U00, U01, /**/ U02,
               /**/        U10, U11, /**/ U12,
         /*************/  /******************/
          UBL, /**/ UBR,   U20, U21, /**/ U22 );

        SlidePartitionRight
        ( XL,     /**/ XR,
          X0, X1, /**/ X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
void benchmark_scan( const vex::Context &ctx, vex::profiler<> &prof ) { const size_t N = 16 * 1024 * 1024; const size_t M = 16; typedef typename std::conditional< std::is_same<float, real>::value, cl_uint, cl_ulong >::type key_type; std::default_random_engine rng( std::rand() ); std::uniform_int_distribution<key_type> rnd; std::vector<key_type> x0(N); std::vector<key_type> x1(N); std::generate(x0.begin(), x0.end(), [&]() { return rnd(rng); }); vex::vector<key_type> X0(ctx, x0); vex::vector<key_type> X1(ctx, N); vex::exclusive_scan(X0, X1); ctx.finish(); prof.tic_cpu("VexCL"); for(size_t i = 0; i < M; i++) vex::exclusive_scan(X0, X1); ctx.finish(); double tot_time = prof.toc("VexCL"); std::cout << "Scan (" << vex::type_name<key_type>() << ")\n" << " VexCL: " << N * M / tot_time << " keys/sec\n"; #ifdef HAVE_BOOST_COMPUTE vex::compute::exclusive_scan(X0, X1); ctx.finish(); prof.tic_cpu("Boost.Compute"); for(size_t i = 0; i < M; i++) vex::compute::exclusive_scan(X0, X1); ctx.finish(); tot_time = prof.toc("Boost.Compute"); std::cout << " Boost.Compute: " << N * M / tot_time << " keys/sec\n"; #endif #ifdef HAVE_CLOGS vex::clogs::exclusive_scan(X0, X1); ctx.finish(); prof.tic_cpu("CLOGS"); for(size_t i = 0; i < M; i++) vex::clogs::exclusive_scan(X0, X1); ctx.finish(); tot_time = prof.toc("CLOGS"); std::cout << " CLOGS: " << N * M / tot_time << " keys/sec\n"; #endif if (options.bm_cpu) { prof.tic_cpu("CPU"); for(size_t i = 0; i < M; i++) { key_type sum = key_type(); for(size_t j = 0; j < N; ++j) { key_type next = sum + x0[j]; x1[j] = sum; sum = next; } } tot_time = prof.toc("CPU"); std::cout << " CPU: " << N * M / tot_time << " keys/sec\n"; } std::cout << std::endl; }
// Blocked, distributed triangular-solve kernel.
//
// Judging by the name and by the LocalTrsm arguments (RIGHT, LOWER,
// orientation), this presumably overwrites X with alpha * X * L^{-T}
// (L^{-H} when orientation == ADJOINT) -- confirm against the public Trsm
// dispatcher.
//
//   orientation     : must not be NORMAL (only checked when RELEASE is not
//                     defined, via LogicError)
//   diag            : forwarded to LocalTrsm
//   alpha           : X is scaled by alpha before the sweep starts
//   L               : triangular operand, read-only
//   X               : right-hand sides, overwritten in place
//   checkIfSingular : forwarded to LocalTrsm
//
// NOTE(review): this routine never calls FreeAlignments() on its
// temporaries, although AlignWith() is called on every pass; presumably
// AlignWith() resets earlier alignments in the library version this targets
// -- confirm.
//
// NOTE(review): F is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrsmRLT
( Orientation orientation, UnitOrNonUnit diag,
  F alpha, const DistMatrix<F>& L, DistMatrix<F>& X,
  bool checkIfSingular )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TrsmRLT");
    if( orientation == NORMAL )
        LogicError("TrsmRLT expects a (Conjugate)Transpose option");
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<F>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    DistMatrix<F> XL(g), XR(g),
                  X0(g), X1(g), X2(g);

    // Temporary distributions
    DistMatrix<F,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<F,VR,  STAR> L21_VR_STAR(g);
    DistMatrix<F,STAR,MR  > L21AdjOrTrans_STAR_MR(g);
    DistMatrix<F,VC,  STAR> X1_VC_STAR(g);
    DistMatrix<F,STAR,MC  > X1Trans_STAR_MC(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionRight( X, XL, XR, 0 );
    // Each pass exposes one diagonal block L11 and the matching panel X1;
    // the loop ends once XR has been consumed (zero width).
    while( XR.Width() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        RepartitionRight
        ( XL, /**/     XR,
          X0, /**/ X1, X2 );

        // Align every temporary with X2, the operand of the local update
        // at the end of this pass.
        X1_VC_STAR.AlignWith( X2 );
        X1Trans_STAR_MC.AlignWith( X2 );
        L21_VR_STAR.AlignWith( X2 );
        L21AdjOrTrans_STAR_MR.AlignWith( X2 );
        //--------------------------------------------------------------------//
        // Solve against the diagonal block, working on redistributed copies
        // of L11 and X1.
        L11_STAR_STAR = L11;
        X1_VC_STAR = X1;

        LocalTrsm
        ( RIGHT, LOWER, orientation, diag,
          F(1), L11_STAR_STAR, X1_VC_STAR, checkIfSingular );

        // Write the solved panel back into X1 by way of its [STAR,MC]
        // transpose, which is also the left operand of the update below.
        X1Trans_STAR_MC.TransposeFrom( X1_VC_STAR );
        X1.TransposeFrom( X1Trans_STAR_MC );
        L21_VR_STAR = L21;
        if( orientation == ADJOINT )
            L21AdjOrTrans_STAR_MR.AdjointFrom( L21_VR_STAR );
        else
            L21AdjOrTrans_STAR_MR.TransposeFrom( L21_VR_STAR );

        // X2[MC,MR] -= X1[MC,*] (L21[MR,*])^(T/H)
        //            = X1^T[* ,MC] (L21^(T/H))[*,MR]
        LocalGemm
        ( TRANSPOSE, NORMAL,
          F(-1), X1Trans_STAR_MC, L21AdjOrTrans_STAR_MR, F(1), X2 );
        //--------------------------------------------------------------------//

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,   L00, L01, /**/ L02,
               /**/        L10, L11, /**/ L12,
         /*************/  /******************/
          LBL, /**/ LBR,   L20, L21, /**/ L22 );

        SlidePartitionRight
        ( XL,     /**/ XR,
          X0, X1, /**/ X2 );
    }
}
// Blocked, distributed triangular matrix-multiply kernel (the "Old" variant
// of TrmmLLTC, going by its name).
//
// Judging by the name and by the LocalTrmm arguments (LEFT, LOWER,
// orientation), this presumably overwrites X with alpha * L^T * X
// (L^H when orientation is not TRANSPOSE) -- confirm against the public Trmm
// dispatcher.
//
//   orientation : must not be NORMAL
//   diag        : forwarded to LocalTrmm
//   alpha       : X is scaled by alpha before the sweep starts
//   L           : square triangular operand, read-only
//   X           : overwritten in place; X.Height() must equal L.Height()
//
// When RELEASE is not defined, the following raise std::logic_error: a grid
// mismatch, orientation == NORMAL, or nonconformal sizes.
//
// NOTE(review): T is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrmmLLTCOld
( Orientation orientation,
  UnitOrNonUnit diag,
  T alpha, const DistMatrix<T>& L,
                 DistMatrix<T>& X )
{
#ifndef RELEASE
    PushCallStack("internal::TrmmLLTCOld");
    if( L.Grid() != X.Grid() )
        throw std::logic_error
        ("L and X must be distributed over the same grid");
    if( orientation == NORMAL )
        throw std::logic_error("TrmmLLT expects a (Conjugate)Transpose option");
    if( L.Height() != L.Width() || L.Height() != X.Height() )
    {
        std::ostringstream msg;
        msg << "Nonconformal TrmmLLTC: \n"
            << " L ~ " << L.Height() << " x " << L.Width() << "\n"
            << " X ~ " << X.Height() << " x " << X.Width() << "\n";
        throw std::logic_error( msg.str().c_str() );
    }
#endif
    const Grid& g = L.Grid();

    // Matrix views
    DistMatrix<T>
        LTL(g), LTR(g),  L00(g), L01(g), L02(g),
        LBL(g), LBR(g),  L10(g), L11(g), L12(g),
                         L20(g), L21(g), L22(g);

    DistMatrix<T> XT(g),  X0(g),
                  XB(g),  X1(g),
                          X2(g);

    // Temporary distributions
    DistMatrix<T,STAR,STAR> L11_STAR_STAR(g);
    DistMatrix<T,MC,  STAR> L21_MC_STAR(g);
    DistMatrix<T,STAR,VR  > X1_STAR_VR(g);
    DistMatrix<T,MR,  STAR> D1AdjOrTrans_MR_STAR(g);
    DistMatrix<T,MR,  MC  > D1AdjOrTrans_MR_MC(g);
    DistMatrix<T,MC,  MR  > D1(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionDownDiagonal
    ( L, LTL, LTR,
         LBL, LBR, 0 );
    PartitionDown
    ( X, XT,
         XB, 0 );
    // Each pass exposes one diagonal block L11 and the matching panel X1;
    // the loop ends once XB has been consumed (zero height).
    while( XB.Height() > 0 )
    {
        LockedRepartitionDownDiagonal
        ( LTL, /**/ LTR,  L00, /**/ L01, L02,
         /*************/ /******************/
               /**/       L10, /**/ L11, L12,
          LBL, /**/ LBR,  L20, /**/ L21, L22 );

        RepartitionDown
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );

        L21_MC_STAR.AlignWith( X2 );
        D1AdjOrTrans_MR_STAR.AlignWith( X1 );
        D1AdjOrTrans_MR_MC.AlignWith( X1 );
        D1.AlignWith( X1 );
        // The transposed accumulator has X1's dimensions swapped; D1 has
        // X1's own dimensions.
        Zeros( X1.Width(), X1.Height(), D1AdjOrTrans_MR_STAR );
        Zeros( X1.Height(), X1.Width(), D1 );
        //--------------------------------------------------------------------//
        // X1 := L11^(T/H) X1, computed on redistributed copies and written
        // back.
        X1_STAR_VR = X1;
        L11_STAR_STAR = L11;
        LocalTrmm
        ( LEFT, LOWER, orientation, diag, T(1), L11_STAR_STAR, X1_STAR_VR );
        X1 = X1_STAR_VR;

        // Local products X2^(T/H) L21, i.e. the (conjugate-)transpose of the
        // update L21^(T/H) X2 that is added to X1 below.
        L21_MC_STAR = L21;
        LocalGemm
        ( orientation, NORMAL,
          T(1), X2, L21_MC_STAR, T(0), D1AdjOrTrans_MR_STAR );
        D1AdjOrTrans_MR_MC.SumScatterFrom( D1AdjOrTrans_MR_STAR );
        // Undo the (conjugate-)transposition on the local matrices so that D1
        // has the shape of X1.
        if( orientation == TRANSPOSE )
            Transpose( D1AdjOrTrans_MR_MC.LocalMatrix(), D1.LocalMatrix() );
        else
            Adjoint( D1AdjOrTrans_MR_MC.LocalMatrix(), D1.LocalMatrix() );
        Axpy( T(1), D1, X1 );
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to new views.
        D1.FreeAlignments();
        D1AdjOrTrans_MR_MC.FreeAlignments();
        D1AdjOrTrans_MR_STAR.FreeAlignments();
        L21_MC_STAR.FreeAlignments();

        SlideLockedPartitionDownDiagonal
        ( LTL, /**/ LTR,   L00, L01, /**/ L02,
               /**/        L10, L11, /**/ L12,
         /*************/  /******************/
          LBL, /**/ LBR,   L20, L21, /**/ L22 );

        SlidePartitionDown
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );
    }
#ifndef RELEASE
    PopCallStack();
#endif
}
// Blocked, distributed triangular-solve kernel (the "Medium" variant, going
// by its name).
//
// Judging by the name and by the in-code update formulas, this presumably
// overwrites X with alpha * U^{-1} * X -- confirm against the public Trsm
// dispatcher. The diagonal-block solve is carried out on the transpose of X1
// (see the LocalTrsm call below).
//
//   diag            : forwarded to LocalTrsm
//   alpha           : X is scaled by alpha before the sweep starts
//   U               : triangular operand, read-only
//   X               : right-hand sides, overwritten in place
//   checkIfSingular : forwarded to LocalTrsm
//
// No grid or size checks are performed here.
//
// NOTE(review): F is not declared inside this block; presumably a template
// parameter introduced immediately above it.
inline void
TrsmLUNMedium
( UnitOrNonUnit diag,
  F alpha, const DistMatrix<F>& U, DistMatrix<F>& X,
  bool checkIfSingular )
{
#ifndef RELEASE
    CallStackEntry entry("internal::TrsmLUNMedium");
#endif
    const Grid& g = U.Grid();

    // Matrix views
    DistMatrix<F>
        UTL(g), UTR(g),  U00(g), U01(g), U02(g),
        UBL(g), UBR(g),  U10(g), U11(g), U12(g),
                         U20(g), U21(g), U22(g);

    DistMatrix<F> XT(g),  X0(g),
                  XB(g),  X1(g),
                          X2(g);

    // Temporary distributions
    DistMatrix<F,MC,  STAR> U01_MC_STAR(g);
    DistMatrix<F,STAR,STAR> U11_STAR_STAR(g);
    DistMatrix<F,MR,  STAR> X1Trans_MR_STAR(g);

    // Start the algorithm
    Scale( alpha, X );
    LockedPartitionUpDiagonal
    ( U, UTL, UTR,
         UBL, UBR, 0 );
    PartitionUp
    ( X, XT,
         XB, 0 );
    // Each pass exposes one diagonal block U11 and the matching panel X1;
    // the loop ends once XT has been consumed (zero height).
    while( XT.Height() > 0 )
    {
        LockedRepartitionUpDiagonal
        ( UTL, /**/ UTR,   U00, U01, /**/ U02,
               /**/        U10, U11, /**/ U12,
         /*************/  /******************/
          UBL, /**/ UBR,   U20, U21, /**/ U22 );

        RepartitionUp
        ( XT,  X0,
               X1,
         /**/ /**/
          XB,  X2 );

        // Align both temporaries with X0, the operand of the local update
        // at the end of this pass.
        U01_MC_STAR.AlignWith( X0 );
        X1Trans_MR_STAR.AlignWith( X0 );
        //--------------------------------------------------------------------//
        U11_STAR_STAR = U11; // U11[* ,* ] <- U11[MC,MR]
        X1Trans_MR_STAR.TransposeFrom( X1 ); // X1^T[MR,* ] <- X1[MC,MR]

        // X1[* ,MR] := U11^-1[* ,* ] X1[* ,MR]
        //
        // expressed on the transposed panel as
        //
        // X1^T[MR,* ] := X1^T[MR,* ] U11^-T[* ,* ]
        LocalTrsm
        ( RIGHT, UPPER, TRANSPOSE, diag,
          F(1), U11_STAR_STAR, X1Trans_MR_STAR, checkIfSingular );
        // Store the solved panel back into X1.
        X1.TransposeFrom( X1Trans_MR_STAR );

        U01_MC_STAR = U01; // U01[MC,* ] <- U01[MC,MR]

        // X0[MC,MR] -= U01[MC,* ] X1[* ,MR]
        LocalGemm
        ( NORMAL, TRANSPOSE,
          F(-1), U01_MC_STAR, X1Trans_MR_STAR, F(1), X0 );
        //--------------------------------------------------------------------//
        // Release the alignments so the next pass can realign to the new X0.
        U01_MC_STAR.FreeAlignments();
        X1Trans_MR_STAR.FreeAlignments();

        SlideLockedPartitionUpDiagonal
        ( UTL, /**/ UTR,  U00, /**/ U01, U02,
         /*************/ /******************/
               /**/       U10, /**/ U11, U12,
          UBL, /**/ UBR,  U20, /**/ U21, U22 );

        SlidePartitionUp
        ( XT,  X0,
         /**/ /**/
               X1,
          XB,  X2 );
    }
}