/* Entry point: initialize FBW and AP, then run the cooperative scheduler forever. */
int main( void ) {
  Fbw(init);
  Ap(init);

  for (;;) {
    /* Periodic tasks run only when the system timer signals a new period. */
    if (sys_time_periodic()) {
      Fbw(periodic_task);
      Ap(periodic_task);
    }
    /* Event tasks are polled on every pass through the loop. */
    Fbw(event_task);
    Ap(event_task);
  }
  return 0; /* not reached */
}
/*
 * PPRZ/AP thread
 *
 * Runs the autopilot periodic and event functions until the thread is
 * asked to terminate, then exits cleanly.
 */
static void thd_ap(void *arg)
{
  (void) arg;
  chRegSetThreadName("AP");

  for (;;) {
    if (chThdShouldTerminateX()) {
      break;
    }
    Ap(handle_periodic_tasks);
    Ap(event_task);
    chThdSleepMicroseconds(500); /* yield between passes */
  }

  chThdExit(0);
}
int main(int argc, char** argv) { //------------------------------------------------------------------------------ // Assimilando dados do arquivo externo //------------------------------------------------------------------------------ Volumes v1; LeituraDadosProblema (v1); std :: vector <Real> xFronteiras (v1.NVOL()+1), //localizações fronteiras xCentro (v1.NVOL()), //localizações centros DistCentro (v1.NVOL()+1), //distâncias entre centros DistFace (v1.NVOL()); //distância entre fronteiras adjacentes GeracaoMalha (xFronteiras, xCentro, DistCentro, DistFace, v1); std :: vector <Real> Ap(v1.NVOL()), Ae(v1.NVOL()), Aw(v1.NVOL()), Sp(v1.NVOL()); CalculaCoeficientes (DistFace, DistCentro, Ap, Ae, Aw, Sp); for (int i=0; i<v1.NVOL(); i++) { std :: cout << "Ae[" << i << "]=" << Ap[i] << std :: endl; } }
// Order-epsilon part of the one-loop A function, evaluated at squared mass x
// and renormalization scale squared qq.  Returns 0 for vanishing argument.
TSIL_COMPLEX Aeps (TSIL_REAL x, TSIL_REAL qq)
{
  // BUG FIX: handle x ~ 0 BEFORE calling Ap().  The original evaluated
  // Ap(x, qq) unconditionally; Ap presumably involves a logarithm of x
  // and must not be evaluated for a vanishing argument.
  if (TSIL_FABS(x) < TSIL_TOL) return 0.0L;

  TSIL_COMPLEX lnbarx = Ap (x, qq);
  return x * (-1.0L - 0.5L*Zeta2 + lnbarx - 0.5L*lnbarx*lnbarx);
}
// Distributed conjugate-gradient solve of A * x = b.  Iterates until the
// residual 2-norm drops below `tolerance` or `maximum_iteration` passes
// complete.  On exit the members iteration, iter_time and norm_res hold the
// iteration count, wall-clock time of the loop and the final residual norm;
// x holds the (approximate) solution.
CGSolve( const ImportType & import
       , const SparseMatrixType & A
       , const VectorType & b
       , const VectorType & x
       , const size_t maximum_iteration = 200
       , const double tolerance = std::numeric_limits<double>::epsilon() )
  : iteration(0)
  , iter_time(0)
  , norm_res(0)
{
  const size_t count_owned = import.count_owned ;
  const size_t count_total = import.count_owned + import.count_receive;

  // Need input vector to matvec to be owned + received
  VectorType pAll ( "cg::p" , count_total );
  // p is the locally-owned prefix of pAll; the matvec reads the full view.
  VectorType p = Kokkos::subview< VectorType >( pAll , std::pair<size_t,size_t>(0,count_owned) );
  VectorType r ( "cg::r" , count_owned );
  VectorType Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */

  /* p = x */       Kokkos::deep_copy( p , x );
  /* import p */    import( pAll );
  /* Ap = A * p */  Kokkos::MV_Multiply( Ap , A , pAll );
  /* b - Ap => r */ Kokkos::V_Add( r , 1.0 , b , -1.0 , Ap );
  /* p = r */       Kokkos::deep_copy( p , r );

  // Dot products are globally reduced over import.comm.
  double old_rdot = Kokkos::Example::all_reduce( Kokkos::V_Dot( r , r ) , import.comm );

  norm_res = sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < norm_res && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */

    /* import p */    import( pAll );
    /* Ap = A * p */  Kokkos::MV_Multiply( Ap , A , pAll );

    const double pAp_dot = Kokkos::Example::all_reduce( Kokkos::V_Dot( p , Ap ) , import.comm );
    const double alpha = old_rdot / pAp_dot ;

    /* x += alpha * p ; */   Kokkos::V_Add( x , alpha, p , 1.0 , x );
    /* r += -alpha * Ap ; */ Kokkos::V_Add( r , -alpha, Ap , 1.0 , r );

    const double r_dot = Kokkos::Example::all_reduce( Kokkos::V_Dot( r , r ) , import.comm );
    const double beta = r_dot / old_rdot ;

    /* p = r + beta * p ; */ Kokkos::V_Add( p , 1.0 , r , beta , p );

    // Carry r_dot forward as old_rdot and refresh the residual norm.
    norm_res = sqrt( old_rdot = r_dot );

    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
// Unpreconditioned conjugate-gradient solve of A * x = b on a single device.
// Outputs: `iteration` (number of CG steps taken), `normr` (final residual
// 2-norm) and `iter_time` (wall-clock seconds spent in the loop); x holds
// the approximate solution on exit.
void cgsolve( const CrsMatrix<AScalarType,Device> & A
            , const View<VScalarType*,LayoutRight,Device> & b
            , const View<VScalarType*,LayoutRight,Device> & x
            , size_t & iteration
            , double & normr
            , double & iter_time
            , const size_t maximum_iteration = 200
            , const double tolerance = std::numeric_limits<VScalarType>::epsilon() )
{
  typedef View<VScalarType*,LayoutRight,Device> vector_type ;

  const size_t count = b.dimension_0();

  vector_type p ( "cg::p" , count );   // search direction
  vector_type r ( "cg::r" , count );   // residual
  vector_type Ap( "cg::Ap", count );   // matvec workspace

  /* r = b - A * x ; */
  /* p = x */        deep_copy( p , x );
  /* Ap = A * p */   multiply( A , p , Ap );
  /* r = b - Ap */   waxpby( count , 1.0 , b , -1.0 , Ap , r );
  /* p = r */        deep_copy( p , r );

  double old_rdot = dot( count , r );
  normr = std::sqrt( old_rdot );
  iteration = 0 ;

  Kokkos::Impl::Timer wall_clock ;

  while ( tolerance < normr && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */
    /* Ap = A * p */ multiply( A , p , Ap );

    const double pAp_dot = dot( count , p , Ap );
    const double alpha = old_rdot / pAp_dot ;

    /* x += alpha * p ; */  axpy( count, alpha, p , x );
    /* r -= alpha * Ap ; */ axpy( count, -alpha, Ap, r );

    const double r_dot = dot( count , r );
    const double beta = r_dot / old_rdot ;

    /* p = r + beta * p ; */ xpby( count , r , beta , p );

    // Carry r_dot forward and refresh the residual norm.
    normr = std::sqrt( old_rdot = r_dot );
    ++iteration ;
  }

  iter_time = wall_clock.seconds();
}
/* Single worker thread running all PPRZ periodic and event work. */
static int32_t pprz_thd(void *arg)
{
  /*
     To be compatible with the RTOS architecture, each of these 4 workers
     should be implemented in a different thread, each of them waiting for
     its job to be done: periodic tasks should sleep, and event tasks
     should wait for an event
   */
  (void) arg;
  chibios_chRegSetThreadName("pprz big loop");

  for (;;) {
    if (chThdShouldTerminate()) {
      break;
    }
    Fbw(handle_periodic_tasks);
    Ap(handle_periodic_tasks);
    Fbw(event_task);
    Ap(event_task);
    chibios_chThdSleepMilliseconds(1); /* yield to other threads */
  }

  return 0;
}
// Grammar rule Af: parses an Ap operand followed by zero or more '**'
// applications, folding each pair of operands into one AST node.
void RpalParser::Af()
{
  pushProc("Af()");
  Ap();
  for (; _nt == "**"; ) {
    read_token(_nt);  // consume the '**' token
    Af();             // parse the right-hand operand
    build("**", 2);   // build the AST node from the top two operands
  }
  popProc("Af()");
}
/* RTEMS entry task: runs the full FBW+AP loop, or an AP-only event loop
 * when built for serial-I/O testing. */
rtems_task Init( rtems_task_argument ignored )
{
#ifndef SERIO_TESTING
  Fbw(init);
  Ap(init);
  for (;;) {
    update_bat(12.0); /* report a fixed battery voltage */
    Fbw(handle_periodic_tasks);
    Ap(handle_periodic_tasks);
    Fbw(event_task);
    Ap(event_task);
  }
#else
  UART1Init();
  for (;;) {
    Ap(event_task);
  }
#endif
  return ; /* not reached */
}
void step() { Vector Ap(mA*mP); Scalar alpha = mRNorm2 / mP.dot(Ap); mx += alpha * mP; mR -= alpha * Ap; Scalar newRNorm2 = mR.squaredNorm(); Scalar beta = newRNorm2 / mRNorm2; mRNorm2 = newRNorm2; mP *= beta; mP += mR; mIt++; }
// Grammar rule Ap: tail of an OR-chain.  Returns NULL on the empty
// production; otherwise parses "OR B Ap" and folds the operands into an
// ExprOr node (or returns the lone left operand when the tail is empty).
Expression* sintactico::Ap()
{
  if (CurrentToken->tipo != OR)
    return NULL; // empty production

  CurrentToken = Lexer->NexToken(); // consume the OR token
  Expression* lhs = B();
  Expression* rhs = Ap();           // recurse for further OR operands
  return (rhs != NULL) ? new ExprOr(lhs, rhs, Lexer->linea) : lhs;
}
// Evaluates the M_f component of the Rk resolution function at x.
// Negative x uses the (Mn, An) pair with effective scale ak-ck; otherwise
// the (Mp, Ap) pair with scale ak+ck.
double RkPdf::MfRk(const double& x, const double& tau, const double& dm)
{
  const double inv_ak = 1.0/ak;
  const double ndmtau = (ck/ak)*dm*tau;
  const double fact = 1.0/(1.0+ndmtau*ndmtau);

  if (x < 0.0) {
    const double scale = ak-ck;
    const double ndm  = dm/scale;   // rescaled mixing frequency
    const double ntau = tau*scale;  // rescaled lifetime
    return inv_ak*fact*(Mn(x,ntau,ndm)+ndmtau*An(x,ntau,ndm));
  }

  const double scale = ak+ck;
  const double ndm  = dm/scale;
  const double ntau = tau*scale;
  return inv_ak*fact*(Mp(x,ntau,ndm)+ndmtau*Ap(x,ntau,ndm));
}
/** * Main function */ int main(void) { // Init Fbw(init); Ap(init); chThdSleepMilliseconds(100); // Create threads apThdPtr = chThdCreateStatic(wa_thd_ap, sizeof(wa_thd_ap), NORMALPRIO, thd_ap, NULL); fbwThdPtr = chThdCreateStatic(wa_thd_fbw, sizeof(wa_thd_fbw), NORMALPRIO, thd_fbw, NULL); // Main loop, do nothing while (TRUE) { chThdSleepMilliseconds(1000); } return 0; }
// Unpreconditioned conjugate-gradient solve of A x = b with initial guess 0.
// Iterates at most `iter` times or until ||r|| < residual; returns the
// (possibly unconverged) solution vector.
SparseVector<double> SparseSolverEigenCustom::cgSolveSparse(const SparseMatrix<double> & A,const SparseVector<double> & b,int iter, double residual)
{
  SparseVector<double> r(b.rows());
  SparseVector<double> p(b.rows());
  SparseVector<double> Ap(b.rows());
  SparseVector<double> x(b.rows()); // initial guess x0 = 0

  r = b - A *x;
  p = r;

  double rTr,pTAp,alpha,beta,rTrnew,rnorm;
  SparseVector<double> vtemp; // 1x1 result of inner products

  // r'r of the current residual; carried across iterations instead of being
  // recomputed at the top of every loop pass (the recomputed value always
  // equaled rTrnew from the previous pass).
  vtemp = r.transpose()*r;
  rTr = vtemp.coeff(0);

  for(int k=0;k<iter;k++)
  {
    Ap = A*p;

    vtemp = p.transpose()*Ap;
    pTAp = vtemp.coeff(0);
    alpha = rTr/pTAp;               // step length

    x = x + (alpha * p);
    r = r - (alpha * Ap);

    rnorm = r.norm();
    if(rnorm<residual)
    {
      break;                        // converged
    }

    vtemp = r.transpose()*r;
    rTrnew = vtemp.coeff(0);
    beta = rTrnew / rTr;            // conjugation coefficient
    p = r + (beta * p);

    rTr = rTrnew;                   // reuse for the next iteration
  }
  return x;
}
// Preconditioned conjugate-gradient solve of A x = b with preconditioner M.
// On return x holds the solution; the members mits, merror, etime, precond
// and namep record the iteration count, final residual norm, elapsed time
// and preconditioner info.
void solve(T &A, Vector &x, Vector &b,U &M)
//T: type of the input matrix,
//U: type of the input preconditioner
{
  timer.tic(); //start measuring execution time
  double tol2 = mtol*mtol; // tolerance is compared against squared norms
  int n = x.size();
  Vector r(n),z(n),p(n),Ap(n);
  double alpha;
  double beta;
  double rr0,rr;
  int k;
  r = b-A*x;    //first residual: vector = vector - matrix*vector
  M.solve(z,r); //solves the system M z = r
  p=z;          //vector copy
  rr0 = z*r;    //inner product of two vectors
  for(k =0;k<mmaxIts;++k)
  {
    Ap = A*p;               //matrix-vector product
    alpha = (rr0) / (Ap*p); //inner product in the divisor
    x = x + alpha*p;        //vector = vector + scalar*vector (saxpy in BLAS)
    r = r - alpha*Ap;
    M.solve(z,r);           //solves the system M z = r
    rr = z*r;               //inner product (preconditioned)
    merror = r*r;           //inner product; convergence uses the plain residual
    if(merror < tol2)
      break;
    beta = rr / rr0;
    p = z + beta*p;         //vector = vector + scalar*vector
    rr0 = rr;
  }
  // NOTE(review): if the loop exhausts mmaxIts without converging, k equals
  // mmaxIts here, so mits reports mmaxIts+1 — confirm this is intended.
  mits = k+1;
  merror = sqrt(merror);
  timer.toc();           //stop measuring time
  etime = timer.etime(); //store the execution time in etime
  precond = true;
  namep = M.name();      //store the preconditioner name
}
int main(void) { // Init sys_time_init(); // Init ChibiOS sdlogOk = chibios_init(); // Init PPRZ Fbw(init); Ap(init); chibios_chThdSleepMilliseconds(100); launch_pprz_thd(&pprz_thd); pprzReady = true; // Call PPRZ periodic and event functions while (TRUE) { chibios_chThdSleepMilliseconds(1000); } return 0; }
// Plain (unpreconditioned) conjugate-gradient solve of A x = b.  On return
// x holds the solution; mits, merror and etime record iteration count,
// final residual norm and elapsed time.
void solve(T &A, Vector &x, Vector &b)
{
  timer.tic(); // measure wall time of the whole solve

  const double tolSq = mtol*mtol; // compare squared norms, no sqrt per pass
  int n = x.size();
  Vector res(n);
  Vector dir(n);
  Vector Adir(n);
  double stepLen;
  double conj;
  double rhoOld,rhoNew;
  int it;

  res = b-A*x;       // initial residual
  rhoOld = res*res;
  dir=res;           // first search direction is the residual

  for(it =0;it<mmaxIts;++it)
  {
    Adir = A*dir;
    stepLen = (rhoOld) / (Adir*dir);
    x = x + stepLen*dir;
    res = res - stepLen*Adir;
    rhoNew = res*res;
    merror = rhoNew;
    if(merror < tolSq)
      break;
    conj = rhoNew / rhoOld;
    dir = res + conj*dir;
    rhoOld = rhoNew;
  }

  mits = it+1;
  merror = sqrt(merror);

  timer.toc();
  etime = timer.etime();
  precond = false; // no preconditioner was used
}
// Recursively subdivides the triangle (Av,Bv,Cv) `div` times to refine a
// sphere approximation: each level splits the triangle into four children
// via edge midpoints re-projected (normalized) onto the unit sphere.  When
// div <= 0 the triangle is emitted as a face (note the reversed winding
// order Cv,Bv,Av).
void VanDerWaals::generateTriangle(Data::Mesh& mesh, Data::OMMesh::VertexHandle const& Av, Data::OMMesh::VertexHandle const& Bv, Data::OMMesh::VertexHandle const& Cv, int div)
{
  if (div <= 0) {
    mesh.addFace(Cv, Bv, Av);
  }else {
    Data::OMMesh::Point Ap(mesh.vertex(Av));
    Data::OMMesh::Point Bp(mesh.vertex(Bv));
    Data::OMMesh::Point Cp(mesh.vertex(Cv));

    // create 3 new vertices at the edge midpoints
    Data::OMMesh::Point ABp((Ap+Bp)*0.5);
    Data::OMMesh::Point BCp((Bp+Cp)*0.5);
    Data::OMMesh::Point CAp((Cp+Ap)*0.5);

    // Normalize the midpoints to keep them on the sphere
    ABp.normalize();
    BCp.normalize();
    CAp.normalize();

    Data::OMMesh::VertexHandle ABv(mesh.addVertex(ABp));
    Data::OMMesh::VertexHandle BCv(mesh.addVertex(BCp));
    Data::OMMesh::VertexHandle CAv(mesh.addVertex(CAp));

    // On a unit sphere the vertex position doubles as its outward normal.
    mesh.setNormal(ABv, ABp);
    mesh.setNormal(BCv, BCp);
    mesh.setNormal(CAv, CAp);

    // Recurse into the three corner triangles and the central one.
    generateTriangle(mesh, Av, ABv, CAv, div-1);
    generateTriangle(mesh, Bv, BCv, ABv, div-1);
    generateTriangle(mesh, Cv, CAv, BCv, div-1);
    generateTriangle(mesh, ABv, BCv, CAv, div-1); //<-- Remove for serpinski
  }
}
// Shifted-family conjugate-gradient (CG-M) solver: solves the whole family
// (A + shift_i I) x_i = b with a single Krylov sequence, then returns the
// weighted sum of the shifted solutions, sum_i weights[i] * x_i.
// The non-shifted system drives the recurrences; per-shift alpha/beta/zeta
// coefficients are updated from it each iteration.
SGVector<complex128_t> CCGMShiftedFamilySolver::solve_shifted_weighted(
  CLinearOperator<SGVector<float64_t>, SGVector<float64_t> >* A,
  SGVector<float64_t> b, SGVector<complex128_t> shifts,
  SGVector<complex128_t> weights)
{
  SG_DEBUG("Entering\n");

  // sanity check
  REQUIRE(A, "Operator is NULL!\n");
  REQUIRE(A->get_dimension()==b.vlen, "Dimension mismatch! [%d vs %d]\n",
    A->get_dimension(), b.vlen);
  REQUIRE(shifts.vector,"Shifts are not initialized!\n");
  REQUIRE(weights.vector,"Weights are not initialized!\n");
  REQUIRE(shifts.vlen==weights.vlen, "Number of shifts and number of "
    "weights are not equal! [%d vs %d]\n", shifts.vlen, weights.vlen);

  // the solution matrix, one column per shift, initial guess 0 for all
  MatrixXcd x_sh=MatrixXcd::Zero(b.vlen, shifts.vlen);
  MatrixXcd p_sh=MatrixXcd::Zero(b.vlen, shifts.vlen);

  // non-shifted direction
  SGVector<float64_t> p_(b.vlen);

  // the rest of the part hinges on eigen3 for computing norms
  Map<VectorXd> b_map(b.vector, b.vlen);
  Map<VectorXd> p(p_.vector, p_.vlen);

  // residual r_i=b-Ax_i, here x_0=[0], so r_0=b
  VectorXd r=b_map;

  // initial direction is same as residual
  p=r;
  p_sh=r.replicate(1, shifts.vlen).cast<complex128_t>();

  // non shifted initializers
  float64_t r_norm2=r.dot(r);
  float64_t beta_old=1.0;
  float64_t alpha=1.0;

  // shifted quantities
  SGVector<complex128_t> alpha_sh(shifts.vlen);
  SGVector<complex128_t> beta_sh(shifts.vlen);
  SGVector<complex128_t> zeta_sh_old(shifts.vlen);
  SGVector<complex128_t> zeta_sh_cur(shifts.vlen);
  SGVector<complex128_t> zeta_sh_new(shifts.vlen);

  // shifted initializers
  zeta_sh_old.set_const(1.0);
  zeta_sh_cur.set_const(1.0);

  // the iterator for this iterative solver
  IterativeSolverIterator<float64_t> it(r, m_max_iteration_limit,
    m_relative_tolerence, m_absolute_tolerence);

  // start the timer
  CTime time;
  time.start();

  // set the residuals to zero
  if (m_store_residuals)
    m_residuals.set_const(0.0);

  // CG iteration begins
  for (it.begin(r); !it.end(r); ++it)
  {
    SG_DEBUG("CG iteration %d, residual norm %f\n",
      it.get_iter_info().iteration_count, it.get_iter_info().residual_norm);

    if (m_store_residuals)
    {
      m_residuals[it.get_iter_info().iteration_count]
        =it.get_iter_info().residual_norm;
    }

    // apply linear operator to the direction vector
    SGVector<float64_t> Ap_=A->apply(p_);
    Map<VectorXd> Ap(Ap_.vector, Ap_.vlen);

    // compute p^{T}Ap, if zero, failure
    float64_t p_dot_Ap=p.dot(Ap);
    if (p_dot_Ap==0.0)
      break;

    // compute the beta parameter of CG_M
    float64_t beta=-r_norm2/p_dot_Ap;

    // compute the zeta-shifted parameter of CG_M
    compute_zeta_sh_new(zeta_sh_old, zeta_sh_cur, shifts, beta_old, beta,
      alpha, zeta_sh_new);

    // compute beta-shifted parameter of CG_M
    compute_beta_sh(zeta_sh_new, zeta_sh_cur, beta, beta_sh);

    // update the solution vector and residual
    for (index_t i=0; i<shifts.vlen; ++i)
      x_sh.col(i)-=beta_sh[i]*p_sh.col(i);

    // r_{i}=r_{i-1}+\beta_{i}Ap
    r+=beta*Ap;

    // compute new ||r||_{2}, if zero, converged
    float64_t r_norm2_i=r.dot(r);
    if (r_norm2_i==0.0)
      break;

    // compute the alpha parameter of CG_M
    alpha=r_norm2_i/r_norm2;

    // update ||r||_{2}
    r_norm2=r_norm2_i;

    // update direction
    p=r+alpha*p;

    compute_alpha_sh(zeta_sh_new, zeta_sh_cur, beta_sh, beta, alpha, alpha_sh);

    for (index_t i=0; i<shifts.vlen; ++i)
    {
      p_sh.col(i)*=alpha_sh[i];
      p_sh.col(i)+=zeta_sh_new[i]*r;
    }

    // update parameters
    for (index_t i=0; i<shifts.vlen; ++i)
    {
      zeta_sh_old[i]=zeta_sh_cur[i];
      zeta_sh_cur[i]=zeta_sh_new[i];
    }
    beta_old=beta;
  }

  float64_t elapsed=time.cur_time_diff();

  if (!it.succeeded(r))
    SG_WARNING("Did not converge!\n");

  SG_INFO("Iteration took %d times, residual norm=%.20lf, time elapsed=%f\n",
    it.get_iter_info().iteration_count, it.get_iter_info().residual_norm, elapsed);

  // compute the final result vector multiplied by weights
  SGVector<complex128_t> result(b.vlen);
  result.set_const(0.0);
  Map<VectorXcd> x(result.vector, result.vlen);
  for (index_t i=0; i<x_sh.cols(); ++i)
    x+=x_sh.col(i)*weights[i];

  SG_DEBUG("Leaving\n");
  return result;
}
// Mixed-precision conjugate-gradient solver: iterates in the "sloppy"
// (reduced) precision and periodically performs a reliable update — a
// recomputation of the true residual in full precision — to correct the
// rounding error accumulated in the iterated residual.  On exit x holds
// the solution and invParam records timing/flops/iteration statistics.
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
{
  int k=0;          // iteration counter
  int rUpdate = 0;  // number of reliable updates performed

  cudaColorSpinorField r(b);
  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param); // full-precision partial-solution accumulator

  mat(r, x, y);  // r = A x (full precision); y serves as temporary here
  zeroCuda(y);

  double r2 = xmyNormCuda(b, r); // r = b - Ax and r2 = |r|^2
  rUpdate ++;

  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    // Same precision: alias the full-precision fields, no copies needed.
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;
  cudaColorSpinorField p(rSloppy); // initial search direction = residual

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  // Bookkeeping for the reliable-update heuristic.
  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  while (r2 > stop && k<invParam.maxiter) {
    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;
    r2_old = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy); // r -= alpha*Ap, returns |r|^2

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    if (!(updateR || updateX)) {
      // Ordinary fused CG step in sloppy precision.
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      // Reliable update: fold the sloppy solution into the full-precision
      // accumulator y and recompute the true residual from scratch.
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x);  // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      // Reset the update heuristic to the freshly computed residual norm.
      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      rUpdate++;

      beta = r2 / r2_old;
      xpayCuda(rSloppy, beta, p); // p = r + beta*p
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  // Fold the last sloppy iterate and the accumulator into the final answer.
  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  if (k==invParam.maxiter)
    warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  //  printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  if (invParam.verbosity >= QUDA_SUMMARIZE){
    // Compute the true residual one last time for reporting.
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n",
               k, sqrt(r2/src_norm), sqrt(true_res / src_norm));
  }

  // Only delete the sloppy fields when they were separate allocations.
  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
int CrsMatrixTranspose( Epetra_CrsMatrix *In, Epetra_CrsMatrix *Out ) { int iam = In->Comm().MyPID() ; int numentries = In->NumGlobalNonzeros(); int NumRowEntries = 0; double *RowValues = 0; int *ColIndices = 0; int numrows = In->NumGlobalRows(); int numcols = In->NumGlobalCols(); std::vector <int> Ap( numcols+1 ); // Column i is stored in Aval(Ap[i]..Ap[i+1]-1) std::vector <int> nextAp( numcols+1 ); // Where to store next value in Column i std::vector <int> Ai( EPETRA_MAX( numcols, numentries) ) ; // Row indices std::vector <double> Aval( EPETRA_MAX( numcols, numentries) ) ; if ( iam == 0 ) { assert( In->NumMyRows() == In->NumGlobalRows() ) ; // // Count the number of entries in each column // std::vector <int>RowsPerCol( numcols ) ; for ( int i = 0 ; i < numcols ; i++ ) RowsPerCol[i] = 0 ; for ( int MyRow = 0; MyRow <numrows; MyRow++ ) { assert( In->ExtractMyRowView( MyRow, NumRowEntries, RowValues, ColIndices ) == 0 ) ; for ( int j = 0; j < NumRowEntries; j++ ) { RowsPerCol[ ColIndices[j] ] ++ ; } } // // Set Ap and nextAp based on RowsPerCol // Ap[0] = 0 ; for ( int i = 0 ; i < numcols ; i++ ) { Ap[i+1]= Ap[i] + RowsPerCol[i] ; nextAp[i] = Ap[i]; } // // Populate Ai and Aval // for ( int MyRow = 0; MyRow <numrows; MyRow++ ) { assert( In->ExtractMyRowView( MyRow, NumRowEntries, RowValues, ColIndices ) == 0 ) ; for ( int j = 0; j < NumRowEntries; j++ ) { Ai[ nextAp[ ColIndices[j] ] ] = MyRow ; Aval[ nextAp[ ColIndices[j] ] ] = RowValues[j] ; nextAp[ ColIndices[j] ] ++ ; } } // // Insert values into Out // for ( int MyRow = 0; MyRow <numrows; MyRow++ ) { int NumInCol = Ap[MyRow+1] - Ap[MyRow] ; Out->InsertGlobalValues( MyRow, NumInCol, &Aval[Ap[MyRow]], &Ai[Ap[MyRow]] ); assert( Out->IndicesAreGlobal() ) ; } } else { assert( In->NumMyRows() == 0 ) ; } assert( Out->FillComplete()==0 ) ; return 0 ; }
// CG int CG(const Teuchos::SerialDenseMatrix<int, double> & A, Teuchos::SerialDenseMatrix<int,double> X,const Teuchos::SerialDenseMatrix<int,double> & B, int max_iter, double tolerance, Stokhos::DiagPreconditioner<int,double> prec) { int n; int k=0; double resid; n=A.numRows(); std::cout << "A= " << A << std::endl; std::cout << "B= " << B << std::endl; Teuchos::SerialDenseMatrix<int, double> Ax(n,1); Ax.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, X, 0.0); Teuchos::SerialDenseMatrix<int, double> r(B); r-=Ax; resid=r.normFrobenius(); Teuchos::SerialDenseMatrix<int, double> rho(1,1); Teuchos::SerialDenseMatrix<int, double> oldrho(1,1); Teuchos::SerialDenseMatrix<int, double> pAp(1,1); Teuchos::SerialDenseMatrix<int, double> Ap(n,1); double b; double a; Teuchos::SerialDenseMatrix<int, double> p(r); while (resid > tolerance && k < max_iter){ Teuchos::SerialDenseMatrix<int, double> z(r); //z=M-1r // prec.ApplyInverse(r,z); rho.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, r, z, 0.0); if (k==0){ p.assign(z); rho.multiply(Teuchos::TRANS, Teuchos::NO_TRANS, 1.0, r, z, 0.0); } else { b=rho(0,0)/oldrho(0,0); p.scale(b); p+=z; } Ap.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, p, 0.0); pAp.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, p, Ap, 0.0); a=rho(0,0)/pAp(0,0); Teuchos::SerialDenseMatrix<int, double> scalep(p); scalep.scale(a); X+=scalep; Ap.scale(a); r-=Ap; oldrho.assign(rho); resid=r.normFrobenius(); k++; } std::cout << "X= " << X << std::endl; return 0; }
// Conjugate-gradient solve of A x = b with initial guess 0.  Iterates until
// the IterativeSolverIterator's tolerance/limit criteria are met and returns
// the solution vector; residual norms are optionally stored per iteration.
SGVector<float64_t> CConjugateGradientSolver::solve(
  CLinearOperator<float64_t>* A, SGVector<float64_t> b)
{
  SG_DEBUG("CConjugateGradientSolve::solve(): Entering..\n");

  // sanity check
  REQUIRE(A, "Operator is NULL!\n");
  REQUIRE(A->get_dimension()==b.vlen, "Dimension mismatch!\n");

  // the final solution vector, initial guess is 0
  SGVector<float64_t> result(b.vlen);
  result.set_const(0.0);

  // the rest of the part hinges on eigen3 for computing norms
  Map<VectorXd> x(result.vector, result.vlen);
  Map<VectorXd> b_map(b.vector, b.vlen);

  // direction vector
  SGVector<float64_t> p_(result.vlen);
  Map<VectorXd> p(p_.vector, p_.vlen);

  // residual r_i=b-Ax_i, here x_0=[0], so r_0=b
  VectorXd r=b_map;

  // initial direction is same as residual
  p=r;

  // the iterator for this iterative solver
  IterativeSolverIterator<float64_t> it(b_map, m_max_iteration_limit,
    m_relative_tolerence, m_absolute_tolerence);

  // CG iteration begins
  float64_t r_norm2=r.dot(r);

  // start the timer
  CTime time;
  time.start();

  // set the residuals to zero
  if (m_store_residuals)
    m_residuals.set_const(0.0);

  for (it.begin(r); !it.end(r); ++it)
  {
    SG_DEBUG("CG iteration %d, residual norm %f\n",
      it.get_iter_info().iteration_count, it.get_iter_info().residual_norm);

    if (m_store_residuals)
    {
      m_residuals[it.get_iter_info().iteration_count]
        =it.get_iter_info().residual_norm;
    }

    // apply linear operator to the direction vector
    SGVector<float64_t> Ap_=A->apply(p_);
    Map<VectorXd> Ap(Ap_.vector, Ap_.vlen);

    // compute p^{T}Ap, if zero, failure
    float64_t p_dot_Ap=p.dot(Ap);
    if (p_dot_Ap==0.0)
      break;

    // compute the alpha parameter of CG
    float64_t alpha=r_norm2/p_dot_Ap;

    // update the solution vector and residual
    // x_{i}=x_{i-1}+\alpha_{i}p
    x+=alpha*p;

    // r_{i}=r_{i-1}-\alpha_{i}p
    r-=alpha*Ap;

    // compute new ||r||_{2}, if zero, converged
    float64_t r_norm2_i=r.dot(r);
    if (r_norm2_i==0.0)
      break;

    // compute the beta parameter of CG
    float64_t beta=r_norm2_i/r_norm2;

    // update direction, and ||r||_{2}
    r_norm2=r_norm2_i;
    p=r+beta*p;
  }

  float64_t elapsed=time.cur_time_diff();

  if (!it.succeeded(r))
    SG_WARNING("Did not converge!\n");

  SG_INFO("Iteration took %ld times, residual norm=%.20lf, time elapsed=%lf\n",
    it.get_iter_info().iteration_count, it.get_iter_info().residual_norm, elapsed);

  SG_DEBUG("CConjugateGradientSolve::solve(): Leaving..\n");
  return result;
}
// Conjugate-gradient solve of A * x = b, optionally preconditioned with a
// symmetric Gauss-Seidel sweep (use_sgs).  Statistics (iterations, timings,
// final residual norm) are written into *result when it is non-null.
// Derived from a distributed version: the import-related code is kept as
// comments and all reductions are local.
inline void pcgsolve(
      //const ImportType & import,
      KernelHandle &kh
    , const CrsMatrix <typename KernelHandle::nonzero_value_type , typename KernelHandle::row_index_type, typename KernelHandle::HandleExecSpace > & A
    , const Kokkos::View <typename KernelHandle::nonzero_value_type *, typename KernelHandle::HandleExecSpace> & b
    , const Kokkos::View <typename KernelHandle::nonzero_value_type * , typename KernelHandle::HandleExecSpace > & x
    , const size_t maximum_iteration = 200
    , const double tolerance = std::numeric_limits<double>::epsilon()
    , CGSolveResult * result = 0
    , bool use_sgs = true
    )
{
  typedef typename KernelHandle::HandleExecSpace Space;
  //typedef typename KernelHandle::nonzero_value_type MScalar;
  typedef typename KernelHandle::nonzero_value_type VScalar;
  //typedef typename KernelHandle::row_index_type Idx_Type;
  //typedef typename KernelHandle::idx_array_type idx_array_type;
  typedef typename Kokkos::View< VScalar * , Space > VectorType ;

  //const size_t count_owned = import.count_owned ;
  //const size_t count_total = import.count_owned + import.count_receive;
  const size_t count_owned = A.graph.nv;
  const size_t count_total = count_owned;

  size_t iteration = 0 ;
  double iter_time = 0 ;
  double matvec_time = 0 ;
  double norm_res = 0 ;
  double precond_time = 0;
  double precond_init_time = 0;

  Kokkos::Impl::Timer wall_clock ;  // measures the whole CG loop
  Kokkos::Impl::Timer timer;        // measures individual phases

  // Need input vector to matvec to be owned + received
  VectorType pAll ( "cg::p" , count_total );
  VectorType p = Kokkos::subview( pAll , std::pair<size_t,size_t>(0,count_owned) );
  VectorType r ( "cg::r" , count_owned );
  VectorType Ap( "cg::Ap", count_owned );

  /* r = b - A * x ; */
  /* p = x */        Kokkos::deep_copy( p , x );
  ///* import p */   import( pAll );
  /* Ap = A * p */   multiply( count_owned , Ap , A , pAll );
  /* r = b - Ap */   waxpby( count_owned , r , 1.0 , b , -1.0 , Ap );
  /* p = r */        Kokkos::deep_copy( p , r );

  //double old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
  double old_rdot = dot( count_owned , r , r );
  norm_res = sqrt( old_rdot );

  int apply_count = 1;
  VectorType z;
  //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
  double precond_old_rdot = 1;
#ifdef PRECOND_NORM
  double precond_norm_res = 1;
#endif
  // NOTE(review): z has not been allocated yet at this point (it is only
  // sized inside the use_sgs branch below) — confirm this deep_copy is
  // intentional.
  Kokkos::deep_copy( p , z );

  //typename KernelHandle::GaussSeidelHandleType *gsHandler;
  bool owner_handle = false; // true when we created the GS handle and must destroy it
  if (use_sgs){
    if (kh.get_gs_handle() == NULL){
      owner_handle = true;
      kh.create_gs_handle();
    }
    //gsHandler = kh.get_gs_handle();
    timer.reset();
    // Numeric setup of the Gauss-Seidel preconditioner.
    KokkosKernels::Experimental::Graph::gauss_seidel_numeric (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff);
    Space::fence();
    precond_init_time += timer.seconds();
    z = VectorType( "pcg::z" , count_owned );
    Space::fence();
    timer.reset();
    // z = M^{-1} r via a symmetric Gauss-Seidel sweep.
    KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply (&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);
    Space::fence();
    precond_time += timer.seconds();
    //double precond_old_rdot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
    precond_old_rdot = dot( count_owned , r , z );
#ifdef PRECOND_NORM
    precond_norm_res = sqrt( precond_old_rdot );
#endif
    Kokkos::deep_copy( p , z ); // first search direction is the preconditioned residual
  }

  iteration = 0 ;

#ifdef PRINTRES
  std::cout << "norm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl;
#ifdef PRECOND_NORM
  if (use_sgs)
    std::cout << "precond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl;
#endif
#endif
  while ( tolerance < norm_res && iteration < maximum_iteration ) {

    /* pAp_dot = dot( p , Ap = A * p ) */
    timer.reset();
    ///* import p */  import( pAll );
    /* Ap = A * p */  multiply( count_owned , Ap , A , pAll );
    Space::fence();
    matvec_time += timer.seconds();

    //const double pAp_dot = Kokkos::Example::all_reduce( dot( count_owned , p , Ap ) , import.comm );
    const double pAp_dot = dot( count_owned , p , Ap ) ;

    // Step length: uses the preconditioned inner product when SGS is active.
    double alpha = 0;
    if (use_sgs){
      alpha = precond_old_rdot / pAp_dot ;
    }
    else {
      alpha = old_rdot / pAp_dot ;
    }

    /* x += alpha * p ; */   waxpby( count_owned , x , alpha, p , 1.0 , x );
    /* r += -alpha * Ap ; */ waxpby( count_owned , r , -alpha, Ap , 1.0 , r );

    //const double r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , r ) , import.comm );
    const double r_dot = dot( count_owned , r , r );
    const double beta_original = r_dot / old_rdot ;

    double precond_r_dot = 1;
    double precond_beta = 1;
    if (use_sgs){
      Space::fence();
      timer.reset();
      // z = M^{-1} r for the updated residual.
      KokkosKernels::Experimental::Graph::symmetric_gauss_seidel_apply(&kh, count_owned, count_owned, A.graph.row_map, A.graph.entries, A.coeff, z, r, true, apply_count);
      Space::fence();
      precond_time += timer.seconds();
      //const double precond_r_dot = Kokkos::Example::all_reduce( dot( count_owned , r , z ) , import.comm );
      precond_r_dot = dot( count_owned , r , z );
      precond_beta = precond_r_dot / precond_old_rdot ;
    }

    double beta = 1;
    if (!use_sgs){
      beta = beta_original;
      /* p = r + beta * p ; */ waxpby( count_owned , p , 1.0 , r , beta , p );
    }
    else {
      beta = precond_beta;
      // p = z + beta * p (preconditioned direction update)
      waxpby( count_owned , p , 1.0 , z , beta , p );
    }

#ifdef PRINTRES
    std::cout << "\tbeta_original:" << beta_original << std::endl;
    if (use_sgs)
      std::cout << "\tprecond_beta:" << precond_beta << std::endl;
#endif

    // Carry the dots forward and refresh the residual norm.
    norm_res = sqrt( old_rdot = r_dot );
#ifdef PRECOND_NORM
    if (use_sgs){
      precond_norm_res = sqrt( precond_old_rdot = precond_r_dot );
    }
#else
    precond_old_rdot = precond_r_dot;
#endif

#ifdef PRINTRES
    std::cout << "\tnorm_res:" << norm_res << " old_rdot:" << old_rdot<< std::endl;
#ifdef PRECOND_NORM
    if (use_sgs)
      std::cout << "\tprecond_norm_res:" << precond_norm_res << " precond_old_rdot:" << precond_old_rdot<< std::endl;
#endif
#endif
    ++iteration ;
  }

  Space::fence();
  iter_time = wall_clock.seconds();

  if ( 0 != result ) {
    result->iteration = iteration ;
    result->iter_time = iter_time ;
    result->matvec_time = matvec_time ;
    result->norm_res = norm_res ;
    result->precond_time = precond_time;
    result->precond_init_time = precond_init_time;
  }

  // Tear down the Gauss-Seidel handle only if this call created it.
  if (use_sgs & owner_handle ){
    kh.destroy_gs_handle();
  }
}
/** * @function calculateTrifocalTensor */ void trifocalTensor::calculateTrifocalTensor() { Eigen::JacobiSVD<Eigen::MatrixXf> svd( mEq, Eigen::ComputeThinU | Eigen::ComputeThinV ); Eigen::MatrixXf V = svd.matrixV(); printf("* V has %d rows and %d cols \n", V.rows(), V.cols() ); Eigen::FullPivLU<Eigen::MatrixXf> lu(mEq); printf("* Rank of mEq is: %d \n", lu.rank() ); //std::cout << "Columns are nullspace : " << std::endl; //std::cout<< lu.kernel() << std::endl; //Eigen::MatrixXf kernel = lu.kernel(); //mmEq(mPointer, ToIndex1(3,1,2)=;3 = kernel.col( kernel.cols() - 1 ); // Eigen::MatrixXf Vt = V.transpose(); mmEq(mPointer, ToIndex1(3,1,2)=;3 = Vt.col( Vt.cols() - 1 ); mT123 = V.col( V.cols() - 1 ); printf("mT123: Rows: %d cols: %d \n", mT123.rows(), mT123.cols() ); // Saving them properly mT.resize(0); Eigen::MatrixXf T1(3,3); T1(0,0) = mT123(0,0); T1(0,1) = mT123(1,0); T1(0,2) = mT123(2,0); T1(1,0) = mT123(3,0); T1(1,1) = mT123(4,0); T1(1,2) = mT123(5,0); T1(2,0) = mT123(6,0); T1(2,1) = mT123(7,0); T1(2,2) = mT123(8,0); mT.push_back(T1); printf("Saved T1 \n"); Eigen::MatrixXf T2(3,3); T2(0,0) = mT123(9,0); T2(0,1) = mT123(10,0); T2(0,2) = mT123(11,0); T2(1,0) = mT123(12,0); T2(1,1) = mT123(13,0); T2(1,2) = mT123(14,0); T2(2,0) = mT123(15,0); T2(2,1) = mT123(16,0); T2(2,2) = mT123(17,0); mT.push_back(T2); printf("Saved T2 \n"); Eigen::MatrixXf T3(3,3); T3(0,0) = mT123(18,0); T3(0,1) = mT123(19,0); T3(0,2) = mT123(20,0); T3(1,0) = mT123(21,0); T3(1,1) = mT123(22,0); T3(1,2) = mT123(23,0); T3(2,0) = mT123(24,0); T3(2,1) = mT123(25,0); T3(2,2) = mT123(26,0); mT.push_back(T3); printf("Saved T3 \n"); // Checking Eigen::MatrixXf res = mEq*mT123; std::cout << "Checking mEq*T: \n"<< res.transpose() << std::endl; // Making it with last guy = 1 // Normalizing for( int i = 0; i < mT.size(); ++i ) { float temp = mT[i](2,2); for( int j = 0; j < 3; ++j ) { for( int k = 0; k < 3; ++k ) { float orig = mT[i](j,k); mT[i](j,k) = orig / temp; } } } // Visualize for( int i = 0; i < 
mT.size(); ++i ) { std::cout << "T("<<i<<"): \n" << mT[i] << std::endl; } // Test lines for( int i = 0; i < mLLL.size(); ++i ) { Eigen::VectorXf A(3); Eigen::VectorXf B(3); Eigen::VectorXf C(3); Eigen::VectorXf Ap(3); A(0) = mLLL[i][0].x; A(1) = mLLL[i][0].y; A(2) = mLLL[i][0].z; B(0) = mLLL[i][1].x; B(1) = mLLL[i][1].y; B(2) = mLLL[i][1].z; C(0) = mLLL[i][2].x; C(1) = mLLL[i][2].y; C(2) = mLLL[i][2].z; Eigen::MatrixXf r0, r1, r2; Eigen::MatrixXf Tt; Tt = mT[0]; r0 = ( B.transpose() )*Tt*C; Ap(0) = r0(0,0); Tt = mT[1]; r1 = ( B.transpose() )*Tt*C; Ap(1) = r1(0,0); Tt = mT[2]; r2 = ( B.transpose() )*Tt*C; Ap(2) = r2(0,0); // Normalize Ap float temp = A(2) / Ap(2); float num; num = Ap(0)*temp; Ap(0) = num; num = Ap(1)*temp; Ap(1) = num; num = Ap(2)*temp; Ap(2) = num; std::cout <<" ("<<i<<") " <<" A: " << A.transpose() << std::endl; std::cout <<" ("<<i<<") " <<" Ap: " << Ap.transpose() << std::endl; } }
/**
 * Entry point: trains a small LeNet-style ConvNet (two conv+pool layers plus
 * fully-connected layers) on MNIST with one of several update rules selected
 * by params.algo: plain "bprop", or quasi-diagonal metric methods "qdMCNat",
 * "qdop", "qdNat". Logs per-epoch (or per-minibatch) loss/accuracy.
 */
int main(int argc, char *argv[]){
  // Parse command-line arguments into a key/value map; default algo is qdMCNat.
  Params params;
  std::map<std::string, std::string> args;
  readArgs(argc, argv, args);
  if(args.find("algo")!=args.end()){
    params.algo = args["algo"];
  }else{
    params.algo = "qdMCNat";
  }
  if(args.find("inst_file")!=args.end())
    setParamsFromFile(args["inst_file"], args, params);
  else
    setParams(params.algo, args, params);

  createLogDir(params.dir_path);

  // Seed the global RNG for reproducibility.
  gen.seed(params.seed);

  // Load the dataset
  MyMatrix X_train, X_valid;
  VectorXd Y_train, Y_valid;
  loadMnist(params.ratio_train, X_train, X_valid, Y_train, Y_valid);
  //loadCIFAR10(params.ratio_train, X_train, X_valid, Y_train, Y_valid);
  //loadLightCIFAR10(params.ratio_train, X_train, X_valid, Y_train, Y_valid);

  // ConvNet parameters: two 5x5 conv layers (20 and 50 filters), each followed
  // by a 2x2 stride-2 pooling layer.
  std::vector<ConvLayerParams> conv_params;
  ConvLayerParams conv_params1;
  conv_params1.Hf = 5;
  conv_params1.stride = 1;
  conv_params1.n_filter = 20;
  conv_params1.padding = 0;
  conv_params.push_back(conv_params1);
  ConvLayerParams conv_params2;
  conv_params2.Hf = 5;
  conv_params2.stride = 1;
  conv_params2.n_filter = 50;
  conv_params2.padding = 0;
  conv_params.push_back(conv_params2);

  std::vector<PoolLayerParams> pool_params;
  PoolLayerParams pool_params1;
  pool_params1.Hf = 2;
  pool_params1.stride = 2;
  pool_params.push_back(pool_params1);
  PoolLayerParams pool_params2;
  pool_params2.Hf = 2;
  pool_params2.stride = 2;
  pool_params.push_back(pool_params2);

  const unsigned n_conv_layer = conv_params.size();

  // Derive each layer's flattened filter size and output spatial size N.
  // NOTE(review): assumes square input images (img_width used for both dims) —
  // TODO confirm against the loader.
  for(unsigned l = 0; l < conv_params.size(); l++){
    if(l==0){
      conv_params[l].filter_size = conv_params[l].Hf * conv_params[l].Hf * params.img_depth;
      conv_params[l].N = (params.img_width - conv_params[l].Hf + 2*conv_params[l].padding)/conv_params[l].stride + 1;
    }
    else{
      conv_params[l].filter_size = conv_params[l].Hf * conv_params[l].Hf * conv_params[l-1].n_filter;
      conv_params[l].N = (pool_params[l-1].N - conv_params[l].Hf + 2*conv_params[l].padding)/conv_params[l].stride + 1;
    }
    pool_params[l].N = (conv_params[l].N - pool_params[l].Hf)/pool_params[l].stride + 1;
  }

  // Neural Network parameters
  const unsigned n_training = X_train.rows();
  const unsigned n_valid = X_valid.rows();
  const unsigned n_feature = X_train.cols();   // NOTE(review): currently unused
  const unsigned n_label = Y_train.maxCoeff() + 1;

  // Prepend the flattened conv output size and append the label count so
  // nn_arch describes the fully-connected part end to end.
  params.nn_arch.insert(params.nn_arch.begin(),conv_params[n_conv_layer-1].n_filter * pool_params[n_conv_layer-1].N * pool_params[n_conv_layer-1].N);
  params.nn_arch.push_back(n_label);
  const unsigned n_layers = params.nn_arch.size();

  // Optimization parameter
  const int n_train_batch = ceil(n_training/(float)params.train_minibatch_size);
  const int n_valid_batch = ceil(n_valid/(float)params.valid_minibatch_size);
  double prev_loss = std::numeric_limits<double>::max();  // for the (disabled) adaptive rule
  double eta = params.eta;

  // Create the convolutional layer
  std::vector<MyMatrix> conv_W(n_conv_layer);
  std::vector<MyMatrix> conv_W_T(n_conv_layer);
  std::vector<MyVector> conv_B(n_conv_layer);

  // Create the neural network (W_out is the last dense layer; W/Wt are the
  // sparse hidden layers and their transposes).
  MyMatrix W_out(params.nn_arch[n_layers-2],n_label);
  std::vector<MySpMatrix> W(n_layers-2);
  std::vector<MySpMatrix> Wt(n_layers-2);
  std::vector<MyVector> B(n_layers-1);

  // Select the activation function and its init scale.
  double init_sigma = 0.;
  ActivationFunction act_func;
  ActivationFunction eval_act_func;
  if(params.act_func_name=="sigmoid"){
    init_sigma = 4.0;
    act_func = std::bind(logistic,true,_1,_2,_3);
    eval_act_func = std::bind(logistic,false,_1,_2,_3);
  }else if(params.act_func_name=="tanh"){
    init_sigma = 1.0;
    act_func = std::bind(my_tanh,true,_1,_2,_3);
    eval_act_func = std::bind(my_tanh,false,_1,_2,_3);
  }else if(params.act_func_name=="relu"){
    init_sigma = 1.0; // TODO: Find the good value
    act_func = std::bind(relu,true,_1,_2,_3);
    eval_act_func = std::bind(relu,false,_1,_2,_3);
  }else{
    std::cout << "Not implemented yet!" << std::endl;
    assert(false);
  }

  std::cout << "Initializing the network... ";
  params.n_params = initNetwork(params.nn_arch, params.act_func_name, params.sparsity, conv_params, pool_params, W_out, W, Wt, B, conv_W, conv_W_T, conv_B);
  // TODO: Init the conv bias

  // Deep copy of parameters for the adaptive rule
  // NOTE(review): only used by the commented-out adaptiveRule call below.
  std::vector<MyMatrix> mu_dW(n_layers-1);
  std::vector<MyVector> mu_dB(n_layers-1);
  MyMatrix pW_out = W_out;
  std::vector<MySpMatrix> pW = W;
  std::vector<MySpMatrix> pWt = Wt;
  std::vector<MyVector> pB = B;

  // Metric state: "pp" = two steps back, "p" = previous step; the dense/sparse
  // blocks of the quasi-diagonal metric for the dense net and the conv layers.
  MyMatrix ppMii_out, ppM0i_out;
  MyVector ppM00_out;
  std::vector<MySpMatrix> ppMii,ppM0i;
  std::vector<MyVector> ppM00;
  MyMatrix pMii_out,pM0i_out;
  MyVector pM00_out;
  std::vector<MySpMatrix> pMii,pM0i;
  std::vector<MyVector> pM00;
  std::vector<MyMatrix> conv_ppMii, conv_ppM0i;
  std::vector<MyVector> conv_ppM00;
  std::vector<MyMatrix> conv_pMii, conv_pM0i;
  std::vector<MyVector> conv_pM00;

  // Convert the labels to one-hot vector
  MyMatrix one_hot = MyMatrix::Zero(n_training, n_label);
  labels2oneHot(Y_train,one_hot);

  // Configure the logger ("verbose" writes to stdout, otherwise to file_path).
  std::ostream* logger;
  if(args.find("verbose")!=args.end()){
    getOutput("",logger);
  }else{
    getOutput(params.file_path,logger);
  }

  double cumul_time = 0.;

  printDesc(params, logger);
  printConvDesc(params, conv_params, pool_params, logger);
  std::cout << "Starting the learning phase... " << std::endl;
  *logger << "Epoch Time(s) train_loss train_accuracy valid_loss valid_accuracy eta" << std::endl;

  // Main training loop: epochs over shuffled minibatches.
  for(unsigned i = 0; i < params.n_epoch; i++){
    for(unsigned j = 0; j < n_train_batch; j++){

      // Mini-batch creation
      unsigned curr_batch_size = 0;
      MyMatrix X_batch, one_hot_batch;
      getMiniBatch(j, params.train_minibatch_size, X_train, one_hot, params, conv_params[0], curr_batch_size, X_batch, one_hot_batch);

      double prev_time = gettime();

      // Forward propagation for conv layer (records pooling argmax indices
      // for use in the backward pass).
      std::vector<std::vector<unsigned>> poolIdxX1(n_conv_layer);
      std::vector<std::vector<unsigned>> poolIdxY1(n_conv_layer);
      MyMatrix z0;
      std::vector<MyMatrix> conv_A(conv_W.size());
      std::vector<MyMatrix> conv_Ap(conv_W.size());
      convFprop(curr_batch_size, conv_params, pool_params, act_func, conv_W, conv_B, X_batch, conv_A, conv_Ap, z0, poolIdxX1, poolIdxY1);

      // Forward propagation (A = activations, Ap = activation derivatives)
      std::vector<MyMatrix> Z(n_layers-1);
      std::vector<MyMatrix> A(n_layers-2);
      std::vector<MyMatrix> Ap(n_layers-2);
      fprop(params.dropout_flag, act_func, W, W_out, B, z0, Z, A, Ap);

      // Compute the output and the error (softmax + cross-entropy gradient)
      MyMatrix out;
      softmax(Z[n_layers-2], out);
      std::vector<MyMatrix> gradB(n_layers-1);
      gradB[n_layers-2] = out - one_hot_batch;

      // Backpropagation
      bprop(Wt, W_out, Ap, gradB);

      // Backpropagation for conv layer: push the first dense-layer gradient
      // back through the flatten + pooling boundary.
      std::vector<MyMatrix> conv_gradB(conv_W.size());
      MyMatrix layer_gradB = (gradB[0] * W[0].transpose());
      MyMatrix pool_gradB;
      layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, layer_gradB, pool_gradB);
      convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, pool_gradB, conv_gradB, poolIdxX1, poolIdxY1);

      if(params.algo == "bprop"){
        // Plain SGD update.
        update(eta, gradB, A, z0, params.regularizer, params.lambda, W_out, W, Wt, B);
        convUpdate(curr_batch_size, eta, conv_params, conv_gradB, conv_A, X_batch, "", 0., conv_W, conv_W_T, conv_B);
      }else{
        // Compute the metric (squared backpropagated gradients, whose source
        // depends on the chosen algorithm).
        std::vector<MyMatrix> metric_gradB(n_layers-1);
        std::vector<MyMatrix> metric_conv_gradB(conv_params.size());

        if(params.algo=="qdMCNat"){
          // Monte-Carlo Approximation of the metric: one label sample drawn
          // from the model's own output distribution.
          std::vector<MyMatrix> mc_gradB(n_layers-1);
          computeMcError(out, mc_gradB[n_layers-2]);

          // Backpropagation
          bprop(Wt, W_out, Ap, mc_gradB);
          for(unsigned k = 0; k < gradB.size(); k++){
            metric_gradB[k] = mc_gradB[k].array().square();
          }

          // Backpropagation for conv layer
          std::vector<MyMatrix> mc_conv_gradB(conv_W.size());
          MyMatrix mc_layer_gradB = (mc_gradB[0] * W[0].transpose());
          MyMatrix mc_pool_gradB;
          layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, mc_layer_gradB, mc_pool_gradB);
          convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, mc_pool_gradB, mc_conv_gradB, poolIdxX1, poolIdxY1);
          for(unsigned k = 0; k < conv_params.size(); k++){
            metric_conv_gradB[k] = mc_conv_gradB[k].array().square();
          }
        }
        else if(params.algo=="qdop"){
          // Outer-product (empirical Fisher) metric: square the actual gradients.
          for(unsigned k = 0; k < conv_params.size(); k++){
            metric_conv_gradB[k] = conv_gradB[k].array().square();
          }
          for(unsigned k = 0; k < gradB.size(); k++){
            metric_gradB[k] = gradB[k].array().square();
          }
        }
        else if(params.algo=="qdNat"){
          // Exact Fisher metric: sum over all labels l, weighting the squared
          // backpropagated gradient of label l by the model probability out(m,l).
          for(unsigned k = 0; k < conv_params.size(); k++){
            metric_conv_gradB[k] = conv_gradB[k].array().square();
          }
          for(unsigned k = 0; k < metric_gradB.size(); k++){
            metric_gradB[k] = MyMatrix::Zero(gradB[k].rows(),gradB[k].cols());
          }
          for(unsigned l = 0; l < n_label; l++){
            MyMatrix fisher_ohbatch = MyMatrix::Zero(curr_batch_size, n_label);
            fisher_ohbatch.col(l).setOnes();

            std::vector<MyMatrix> fgradB(n_layers-1);
            fgradB[n_layers-2] = out - fisher_ohbatch;
            bprop(Wt, W_out, Ap, fgradB);

            // Backpropagation for conv layer
            std::vector<MyMatrix> fisher_conv_gradB(conv_W.size());
            MyMatrix fisher_layer_gradB = (fgradB[0] * W[0].transpose());
            MyMatrix fisher_pool_gradB;
            layer2pool(curr_batch_size, pool_params[conv_W.size()-1].N, conv_params[conv_W.size()-1].n_filter, fisher_layer_gradB, fisher_pool_gradB);
            convBprop(curr_batch_size, conv_params, pool_params, conv_W_T, conv_Ap, fisher_pool_gradB, fisher_conv_gradB, poolIdxX1, poolIdxY1);

            // Weight each example m's squared conv gradient by out(m,l).
            // Conv gradients are laid out (filter, example*N*N + pixel).
            for(unsigned k = 0; k < conv_params.size(); k++){
              MyMatrix fisher_conv_gradB_sq = fisher_conv_gradB[k].array().square();
              for(unsigned m = 0; m < out.rows(); m++){
                for(unsigned f = 0; f < conv_params[k].n_filter; f++){
                  for(unsigned n = 0; n < conv_params[k].N * conv_params[k].N; n++){
                    fisher_conv_gradB_sq(f,m*conv_params[k].N*conv_params[k].N+n) *= out(m,l);
                  }
                }
              }
              metric_conv_gradB[k] += fisher_conv_gradB_sq;
            }
            for(unsigned k = 0; k < W.size(); k++){
              const unsigned rev_k = n_layers - k - 2;
              metric_gradB[rev_k] += (fgradB[rev_k].array().square().array().colwise() * out.array().col(l)).matrix();
            }
          }
        }

        // On the very first minibatch the running metric has no history;
        // init_flag tells updateMetric to seed it instead of averaging.
        bool init_flag = false;
        if(i == 0 && j == 0 && !params.init_metric_id){
          init_flag = true;
        }

        // Build and smooth the conv-layer metric, then the dense-layer metric,
        // and finally apply the metric-preconditioned update.
        std::vector<MyMatrix> conv_Mii(conv_params.size());
        std::vector<MyMatrix> conv_M0i(conv_params.size());
        std::vector<MyVector> conv_M00(conv_params.size());
        buildConvQDMetric(curr_batch_size, metric_conv_gradB, conv_A, X_batch, conv_W, params.matrix_reg, conv_Mii, conv_M0i, conv_M00);
        updateConvMetric(init_flag, params.metric_gamma, conv_pMii, conv_pM0i, conv_pM00, conv_Mii, conv_M0i, conv_M00);

        MyMatrix Mii_out, M0i_out;
        MyVector M00_out;
        std::vector<MySpMatrix> Mii(W.size());
        std::vector<MySpMatrix> M0i(W.size());
        std::vector<MyVector> M00(W.size());
        buildQDMetric(metric_gradB, A, z0, W_out, W, params.matrix_reg, Mii_out, M0i_out, M00_out, Mii, M0i, M00);
        updateMetric(init_flag, params.metric_gamma, Mii_out, M0i_out, M00_out, Mii, M0i, M00, pMii_out, pM0i_out, pM00_out, pMii, pM0i, pM00);
        update(eta, gradB, A, z0, params.regularizer, params.lambda, W_out, W, Wt, B, Mii_out, M0i_out, M00_out, Mii, M0i, M00);
      }

      double curr_time = gettime();
      cumul_time += curr_time - prev_time;

      // Per-minibatch evaluation/logging (expensive: full train+valid pass).
      if(params.minilog_flag){
        double train_loss = 0.;
        double train_accuracy = 0.;
        double valid_loss = 0.;
        double valid_accuracy = 0.;
        evalModel(eval_act_func, params, n_train_batch, n_training, X_train, Y_train, conv_params, pool_params, conv_W, conv_B, W_out, W, B, train_loss, train_accuracy);
        evalModel(eval_act_func, params, n_valid_batch, n_valid, X_valid, Y_valid, conv_params, pool_params, conv_W, conv_B, W_out, W, B, valid_loss, valid_accuracy);

        // Logging
        *logger << i + float(j)/n_train_batch << " " << cumul_time << " " << train_loss << " " << train_accuracy << " " << valid_loss << " " << valid_accuracy << " " << eta << std::endl;
      }
    }

    // Per-epoch evaluation/logging.
    if(!params.minilog_flag || params.adaptive_flag){
      double train_loss = 0.;
      double train_accuracy = 0.;
      double valid_loss = 0.;
      double valid_accuracy = 0.;
      evalModel(eval_act_func, params, n_train_batch, n_training, X_train, Y_train, conv_params, pool_params, conv_W, conv_B, W_out, W, B, train_loss, train_accuracy);
      evalModel(eval_act_func, params, n_valid_batch, n_valid, X_valid, Y_valid, conv_params, pool_params, conv_W, conv_B, W_out, W, B, valid_loss, valid_accuracy);

      // if(params.adaptive_flag)
      //   adaptiveRule(train_loss, prev_loss, eta, W, B, pMii, pM0i, pM00, pW, pB, ppMii, ppM0i, ppM00);

      // Logging
      if(!params.minilog_flag){
        *logger << i << " " << cumul_time << " " << train_loss << " " << train_accuracy << " " << valid_loss << " " << valid_accuracy << " " << eta << std::endl;
      }
    }
  }
}
/**
 * Mixed-precision conjugate-gradient solve of mat * x = b on the GPU.
 *
 * The iteration runs in "sloppy" precision (param.precision_sloppy) with
 * periodic "reliable updates": when the sloppy residual has dropped enough
 * relative to its recent peaks, the true residual is recomputed in full
 * precision and the partial solution is accumulated into y. On exit x holds
 * the solution and param.true_res / param.true_res_hq the true residuals.
 *
 * @param x solution vector (also used as initial guess)
 * @param b right-hand side
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) {
  profile.Start(QUDA_PROFILE_INIT);

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if(b2 == 0){
    profile.Stop(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x=b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    return;
  }

  // r = b - A*x in full precision; y accumulates reliable-update corrections.
  cudaColorSpinorField r(b);
  ColorSpinorParam csParam(x);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, csParam);
  mat(r, x, y);
  // zeroCuda(y);
  double r2 = xmyNormCuda(b, r);

  // Sloppy-precision work vectors.
  csParam.setPrecision(param.precision_sloppy);
  cudaColorSpinorField Ap(x, csParam);
  cudaColorSpinorField tmp(x, csParam);

  cudaColorSpinorField *tmp2_p = &tmp;
  // tmp2 only needed for multi-gpu Wilson-like kernels
  if (mat.Type() != typeid(DiracStaggeredPC).name() && mat.Type() != typeid(DiracStaggered).name()) {
    tmp2_p = new cudaColorSpinorField(x, csParam);
  }
  cudaColorSpinorField &tmp2 = *tmp2_p;

  // If sloppy precision equals full precision, alias x/r instead of copying.
  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (param.precision_sloppy == x.Precision()) {
    csParam.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, csParam);
    r_sloppy = new cudaColorSpinorField(r, csParam);
  }
  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;
  cudaColorSpinorField p(rSloppy);

  // When xSloppy is a separate field it starts at zero and x's contribution
  // is folded into y; otherwise y starts at zero.
  if(&x != &xSloppy){
    copyCuda(y,x);
    zeroCuda(xSloppy);
  }else{
    zeroCuda(y);
  }

  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

  profile.Stop(QUDA_PROFILE_INIT);
  profile.Start(QUDA_PROFILE_PREAMBLE);

  double r2_old;
  double stop = b2*param.tol*param.tol; // stopping condition of solver
  double heavy_quark_res = 0.0; // heavy quark residual
  if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
  int heavy_quark_check = 10; // how often to check the heavy quark residual

  double alpha=0.0, beta=0.0;
  double pAp;
  int rUpdate = 0;   // number of reliable updates performed

  // Residual-norm history driving the reliable-update heuristic.
  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = param.delta;

  // this parameter determines how many consective reliable update
  // reisudal increases we tolerate before terminating the solver,
  // i.e., how long do we want to keep trying to converge
  int maxResIncrease = 0; // 0 means we have no tolerance

  profile.Stop(QUDA_PROFILE_PREAMBLE);
  profile.Start(QUDA_PROFILE_COMPUTE);
  blas_flops = 0;

  int k=0;
  PrintStats("CG", k, r2, b2, heavy_quark_res);

  int steps_since_reliable = 1;

  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    double sigma;
    bool breakdown = false;

    if (param.pipeline) {
      // Pipelined CG: fused reduction gives r2, |Ap|^2 and p.Ap in one pass;
      // sigma is the predicted next r2, recomputed exactly if it breaks down.
      double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
      r2 = triplet.x; double Ap2 = triplet.y; pAp = triplet.z;
      r2_old = r2;
      alpha = r2 / pAp;
      sigma = alpha*(alpha * Ap2 - pAp);
      if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
        r2 = axpyNormCuda(-alpha, Ap, rSloppy);
        sigma = r2;
        breakdown = true;
      }
      r2 = sigma;
    } else {
      r2_old = r2;
      pAp = reDotProductCuda(p, Ap);
      alpha = r2 / pAp;

      // here we are deploying the alternative beta computation
      Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
      r2 = real(cg_norm); // (r_new, r_new)
      sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
    }

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    // force a reliable update if we are within target tolerance (only if doing reliable updates)
    if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

    if ( !(updateR || updateX)) {
      // Normal sloppy iteration: update x, r and p in fused kernels.
      //beta = r2 / r2_old;
      beta = sigma / r2_old; // use the alternative beta computation
      if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
      else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

      if (use_heavy_quark_res && k%heavy_quark_check==0) {
        copyCuda(tmp,y);
        heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
      }

      steps_since_reliable++;
    } else {
      // Reliable update: fold xSloppy into y, recompute the true residual in
      // full precision, and restart the sloppy accumulators.
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);

      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      // break-out check if we have reached the limit of the precision
      // NOTE(review): resIncrease is function-local static, so the count
      // persists across solves (and is shared by all CG instances).
      static int resIncrease = 0;
      if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
        warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
        k++;
        rUpdate++;
        if (++resIncrease > maxResIncrease) break;
      } else {
        resIncrease = 0;
      }

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      rUpdate++;

      // explicitly restore the orthogonality of the gradient vector
      double rp = reDotProductCuda(rSloppy, p) / (r2);
      axpyCuda(-rp, rSloppy, p);

      beta = r2 / r2_old;
      xpayCuda(rSloppy, beta, p);

      if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);

      steps_since_reliable = 0;
    }

    breakdown = false;
    k++;
    PrintStats("CG", k, r2, b2, heavy_quark_res);
  }

  // Final accumulation: x = xSloppy + y.
  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  profile.Stop(QUDA_PROFILE_COMPUTE);
  profile.Start(QUDA_PROFILE_EPILOGUE);

  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);
  param.gflops = gflops;
  param.iter += k;

  if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);

  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("CG: Reliable updates = %d\n", rUpdate);

  // compute the true residuals
  mat(r, x, y);
  param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
  param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
  param.true_res_hq = 0.0;
#endif

  PrintSummary("CG", k, r2, b2);

  // reset the flops counters
  quda::blas_flops = 0;
  mat.flops();
  matSloppy.flops();

  profile.Stop(QUDA_PROFILE_EPILOGUE);
  profile.Start(QUDA_PROFILE_FREE);

  // Release only the fields we allocated above (aliased refs are not owned).
  if (&tmp2 != &tmp) delete tmp2_p;
  if (param.precision_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  profile.Stop(QUDA_PROFILE_FREE);

  return;
}
//=============================================================================
/**
 * Performs the DSCPACK symbolic factorization phase.
 *
 * Builds (or borrows) the graph of the problem matrix, replicates it on every
 * process, converts it to compressed-row (Ap/Ai) form, chooses how many
 * processes DSCPACK may use (a power of two bounded by MaxProcs_), and calls
 * DSC_Order + DSC_SFactor on the processes inside the DSCPACK grid.
 *
 * Side effects: sets MyPID_, NumProcs_, MaxProcs_, DscNumProcs, MyDscRank,
 * the NumLocal*/GlobalStruct* ordering outputs, TotalMemory_, and the timers.
 *
 * @return 0 on success (errors are raised through AMESOS_CHK_ERR).
 */
int Amesos_Dscpack::PerformSymbolicFactorization()
{
  ResetTimer(0);
  ResetTimer(1);

  MyPID_ = Comm().MyPID();
  NumProcs_ = Comm().NumProc();

  Epetra_RowMatrix *RowMatrixA = Problem_->GetMatrix();
  if (RowMatrixA == 0) AMESOS_CHK_ERR(-1);

  const Epetra_Map& OriginalMap = RowMatrixA->RowMatrixRowMap() ;
  // DSCPACK needs a raw MPI communicator; a non-MPI Comm will throw here.
  const Epetra_MpiComm& comm1 = dynamic_cast<const Epetra_MpiComm &> (Comm());
  int numrows = RowMatrixA->NumGlobalRows();
  int numentries = RowMatrixA->NumGlobalNonzeros();

  // Use the CRS matrix's own graph when available; otherwise build one row
  // by row from the generic RowMatrix interface.
  Teuchos::RCP<Epetra_CrsGraph> Graph;
  Epetra_CrsMatrix* CastCrsMatrixA = dynamic_cast<Epetra_CrsMatrix*>(RowMatrixA);
  if (CastCrsMatrixA)
  {
    // Non-owning RCP: the graph belongs to the user's matrix.
    Graph = Teuchos::rcp(const_cast<Epetra_CrsGraph*>(&(CastCrsMatrixA->Graph())), false);
  }
  else
  {
    int MaxNumEntries = RowMatrixA->MaxNumEntries();
    Graph = Teuchos::rcp(new Epetra_CrsGraph(Copy, OriginalMap, MaxNumEntries));

    std::vector<int> Indices(MaxNumEntries);
    std::vector<double> Values(MaxNumEntries);

    for (int i = 0 ; i < RowMatrixA->NumMyRows() ; ++i)
    {
      int NumEntries;
      RowMatrixA->ExtractMyRowCopy(i, MaxNumEntries, NumEntries, &Values[0], &Indices[0]);

      // Translate local column IDs to global IDs before insertion.
      for (int j = 0 ; j < NumEntries ; ++j)
        Indices[j] = RowMatrixA->RowMatrixColMap().GID(Indices[j]);

      int GlobalRow = RowMatrixA->RowMatrixRowMap().GID(i);

      Graph->InsertGlobalIndices(GlobalRow, NumEntries, &Indices[0]);
    }

    Graph->FillComplete();
  }

  //
  // Create a replicated map and graph
  //
  std::vector<int> AllIDs( numrows ) ;
  for ( int i = 0; i < numrows ; i++ ) AllIDs[i] = i ;

  Epetra_Map ReplicatedMap( -1, numrows, &AllIDs[0], 0, Comm());
  Epetra_Import ReplicatedImporter(ReplicatedMap, OriginalMap);
  Epetra_CrsGraph ReplicatedGraph( Copy, ReplicatedMap, 0 );

  AMESOS_CHK_ERR(ReplicatedGraph.Import(*Graph, ReplicatedImporter, Insert));
  AMESOS_CHK_ERR(ReplicatedGraph.FillComplete());

  //
  // Convert the matrix to Ap, Ai (compressed-row structure for DSCPACK)
  //
  std::vector <int> Replicates(numrows);
  std::vector <int> Ap(numrows + 1);
  std::vector <int> Ai(EPETRA_MAX(numrows, numentries));

  // Each structural "supernode" represents exactly one row.
  for( int i = 0 ; i < numrows; i++ ) Replicates[i] = 1;

  int NumEntriesPerRow ;
  int *ColIndices = 0 ;
  int Ai_index = 0 ;
  for ( int MyRow = 0; MyRow <numrows; MyRow++ )
  {
    AMESOS_CHK_ERR( ReplicatedGraph.ExtractMyRowView( MyRow, NumEntriesPerRow, ColIndices ) );
    Ap[MyRow] = Ai_index ;
    for ( int j = 0; j < NumEntriesPerRow; j++ )
    {
      Ai[Ai_index] = ColIndices[j] ;
      Ai_index++;
    }
  }
  assert( Ai_index == numentries ) ;
  Ap[ numrows ] = Ai_index ;

  MtxConvTime_ = AddTime("Total matrix conversion time", MtxConvTime_, 0);
  ResetTimer(0);

  //
  // Call Dscpack Symbolic Factorization
  //
  int OrderCode = 2;
  std::vector<double> MyANonZ;

  NumLocalNonz = 0 ;
  GlobalStructNewColNum = 0 ;
  GlobalStructNewNum = 0 ;
  GlobalStructOwner = 0 ;
  LocalStructOldNum = 0 ;

  NumGlobalCols = 0 ;

  // MS // Have to define the maximum number of processes to be used
  // MS // This is only a suggestion as Dscpack uses a number of processes that is a power of 2

  int NumGlobalNonzeros = GetProblem()->GetMatrix()->NumGlobalNonzeros();
  int NumRows = GetProblem()->GetMatrix()->NumGlobalRows();

  // optimal value for MaxProcs == -1
  int OptNumProcs1 = 1+EPETRA_MAX( NumRows/10000, NumGlobalNonzeros/1000000 );
  OptNumProcs1 = EPETRA_MIN(NumProcs_,OptNumProcs1 );

  // optimal value for MaxProcs == -2
  int OptNumProcs2 = (int)sqrt(1.0 * NumProcs_);
  if( OptNumProcs2 < 1 ) OptNumProcs2 = 1;

  // fix the value of MaxProcs
  switch (MaxProcs_)
  {
  case -1:
    MaxProcs_ = EPETRA_MIN(OptNumProcs1, NumProcs_);
    break;
  case -2:
    MaxProcs_ = EPETRA_MIN(OptNumProcs2, NumProcs_);
    break;
  case -3:
    MaxProcs_ = NumProcs_;
    break;
  }

#if 0
  if (MyDscRank>=0 && A_and_LU_built) {
    DSC_ReFactorInitialize(PrivateDscpackData_->MyDSCObject);
  }
#endif
  //  if ( ! A_and_LU_built ) {
  //    DSC_End( PrivateDscpackData_->MyDSCObject ) ;
  //    PrivateDscpackData_->MyDSCObject = DSC_Begin() ;
  //  }

  // MS // here I continue with the old code...

  OverheadTime_ = AddTime("Total Amesos overhead time", OverheadTime_, 1);

  // Pick the largest power of two not exceeding min(MaxProcs_, DscMax).
  DscNumProcs = 1 ;
  int DscMax = DSC_Analyze( numrows, &Ap[0], &Ai[0], &Replicates[0] );

  while ( DscNumProcs * 2 <=EPETRA_MIN( MaxProcs_, DscMax ) )  DscNumProcs *= 2 ;

  MyDscRank = -1;
  DSC_Open0( PrivateDscpackData_->MyDSCObject_, DscNumProcs, &MyDscRank, comm1.Comm()) ;

  NumLocalCols = 0 ; // This is for those processes not in the Dsc grid
  if ( MyDscRank >= 0 ) {
    assert( MyPID_ == MyDscRank ) ;
    AMESOS_CHK_ERR( DSC_Order ( PrivateDscpackData_->MyDSCObject_, OrderCode, numrows, &Ap[0], &Ai[0],
                                &Replicates[0], &NumGlobalCols, &NumLocalStructs,
                                &NumLocalCols, &NumLocalNonz, &GlobalStructNewColNum, &GlobalStructNewNum,
                                &GlobalStructOwner, &LocalStructOldNum ) ) ;
    assert( NumGlobalCols == numrows ) ;
    assert( NumLocalCols == NumLocalStructs ) ;
  }

  if ( MyDscRank >= 0 ) {
    int MaxSingleBlock;

    const int Limit = 5000000 ;  //  Memory Limit set to 5 Terabytes
    AMESOS_CHK_ERR( DSC_SFactor ( PrivateDscpackData_->MyDSCObject_, &TotalMemory_,
                                  &MaxSingleBlock, Limit, DSC_LBLAS3, DSC_DBLAS2 ) ) ;
  }

  //  A_and_LU_built = true;   // If you uncomment this, TestOptions fails

  SymFactTime_ = AddTime("Total symbolic factorization time", SymFactTime_, 0);

  return(0);
}
void cgsolve( const ParallelDataMap data_map , const CrsMatrix<AScalarType,Device> A , const View<VScalarType*,Device> b , const View<VScalarType*,Device> x , size_t & iteration , double & normr , double & iter_time , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<VScalarType>::epsilon() ) { typedef View<VScalarType*,Device> vector_type ; typedef View<VScalarType, Device> value_type ; const size_t count_owned = data_map.count_owned ; const size_t count_total = data_map.count_owned + data_map.count_receive ; Operator<AScalarType,VScalarType,Device> matrix_operator( data_map , A ); // Need input vector to matvec to be owned + received vector_type pAll ( "cg::p" , count_total ); vector_type p = Kokkos::subview< vector_type >( pAll , std::pair<size_t,size_t>(0,count_owned) ); vector_type r ( "cg::r" , count_owned ); vector_type Ap( "cg::Ap", count_owned ); /* r = b - A * x ; */ /* p = x */ deep_copy( p , x ); /* Ap = A * p */ matrix_operator.apply( pAll , Ap ); /* r = b - Ap */ waxpby( count_owned , 1.0 , b , -1.0 , Ap , r ); /* p = r */ deep_copy( p , r ); double old_rdot = dot( count_owned , r , data_map.machine ); normr = sqrt( old_rdot ); iteration = 0 ; Kokkos::Impl::Timer wall_clock ; while ( tolerance < normr && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ /* Ap = A * p */ matrix_operator.apply( pAll , Ap ); const double pAp_dot = dot( count_owned , p , Ap , data_map.machine ); const double alpha = old_rdot / pAp_dot ; /* x += alpha * p ; */ axpy( count_owned, alpha, p , x ); /* r -= alpha * Ap ; */ axpy( count_owned, -alpha, Ap, r ); const double r_dot = dot( count_owned , r , data_map.machine ); const double beta = r_dot / old_rdot ; /* p = r + beta * p ; */ xpby( count_owned , r , beta , p ); normr = sqrt( old_rdot = r_dot ); ++iteration ; } iter_time = wall_clock.seconds(); }
/**
 * Preconditioned conjugate-gradient solve of A * X = B for dense serial
 * matrices (X and B are n x 1 column vectors).
 *
 * @param A          system matrix (n x n)
 * @param X          initial guess on entry, solution on exit
 * @param B          right-hand side
 * @param max_iter   maximum number of CG iterations
 * @param tolerance  stop when the residual Frobenius norm falls below this
 * @param prec_iter  inner iteration count passed to some preconditioners
 * @param order, m   expansion order / size forwarded to the Schur preconditioner
 * @param PrecNum    preconditioner selector: 0 none, 1 diagonal, 2 Jacobi,
 *                   3 Gauss-Seidel, 4 Schur
 * @param M          preconditioner matrix
 * @param diag       diagonal option forwarded to the Schur preconditioner
 * @return 0 (always; convergence is by residual/iteration count only)
 */
ordinal_type Stokhos::CGDivisionExpansionStrategy<ordinal_type,value_type,node_type>::
CG(const Teuchos::SerialDenseMatrix<ordinal_type, value_type> & A,
   Teuchos::SerialDenseMatrix<ordinal_type,value_type> & X,
   const Teuchos::SerialDenseMatrix<ordinal_type,value_type> & B,
   ordinal_type max_iter,
   value_type tolerance,
   ordinal_type prec_iter,
   ordinal_type order ,
   ordinal_type m,
   ordinal_type PrecNum,
   const Teuchos::SerialDenseMatrix<ordinal_type, value_type> & M,
   ordinal_type diag)
{
  ordinal_type n = A.numRows();
  ordinal_type k=0;
  value_type resid;

  // r = B - A*X
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> Ax(n,1);
  Ax.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, X, 0.0);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> r(Teuchos::Copy,B);
  r-=Ax;
  resid=r.normFrobenius();

  Teuchos::SerialDenseMatrix<ordinal_type, value_type> p(r);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> rho(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> oldrho(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> pAp(1,1);
  Teuchos::SerialDenseMatrix<ordinal_type, value_type> Ap(n,1);
  value_type b;
  value_type a;

  while (resid > tolerance && k < max_iter){
    // Apply the preconditioner: solve M*z = r (z = r when PrecNum == 0).
    Teuchos::SerialDenseMatrix<ordinal_type, value_type> z(r);
    if (PrecNum != 0){
      if (PrecNum == 1){
        Stokhos::DiagPreconditioner<ordinal_type, value_type> precond(M);
        precond.ApplyInverse(r,z,prec_iter);
      }
      else if (PrecNum == 2){
        Stokhos::JacobiPreconditioner<ordinal_type, value_type> precond(M);
        precond.ApplyInverse(r,z,2);
      }
      else if (PrecNum == 3){
        Stokhos::GSPreconditioner<ordinal_type, value_type> precond(M,0);
        precond.ApplyInverse(r,z,1);
      }
      else if (PrecNum == 4){
        Stokhos::SchurPreconditioner<ordinal_type, value_type> precond(M, order, m, diag);
        precond.ApplyInverse(r,z,prec_iter);
      }
    }

    // rho = r'z
    rho.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, r, z, 0.0);

    if (k==0){
      // First step: search direction is the preconditioned residual.
      // (A redundant recomputation of rho with identical arguments was
      // removed here — rho was already computed just above.)
      p.assign(z);
    }
    else {
      // p = z + (rho/oldrho) * p
      b=rho(0,0)/oldrho(0,0);
      p.scale(b);
      p+=z;
    }

    // alpha = rho / p'Ap ; then X += alpha*p, r -= alpha*Ap.
    Ap.multiply(Teuchos::NO_TRANS,Teuchos::NO_TRANS,1.0, A, p, 0.0);
    pAp.multiply(Teuchos::TRANS,Teuchos::NO_TRANS,1.0, p, Ap, 0.0);
    a=rho(0,0)/pAp(0,0);
    Teuchos::SerialDenseMatrix<ordinal_type, value_type> scalep(p);
    scalep.scale(a);
    X+=scalep;
    Ap.scale(a);
    r-=Ap;

    oldrho.assign(rho);
    resid=r.normFrobenius();
    k++;
  }

  //std::cout << "iteration count  " << k << std::endl;
  return 0;
}