/*
 * Apply the L2L z-translation matrices stored in FMMV->Tz_L2L to the
 * coefficient vector x, overwriting x with the result.
 *
 * x is walked as a sequence of contiguous segments: one segment of length
 * p+1 that is multiplied by Tz[0], followed, for every k = 1..p, by two
 * segments of length p+1-k that are both multiplied by Tz[k].  In total
 * (p+1) + 2*(p + (p-1) + ... + 1) = (p+1)^2 entries of x are transformed.
 *
 * FMMV->beta == 0 : each segment is handed to tpmv_upper, which only receives
 *                   the segment itself, so the product is formed in place.
 * otherwise       : each segment is multiplied with gemv into the scratch
 *                   buffer y, which is copied back over x at the end.
 *
 * Parameters:
 *   FMMV  handle providing pL (the order p), beta and the Tz_L2L matrices.
 *   x     coefficient vector; must hold at least (p+1)^2 entries.
 */
static void Tz_L2L(FmmvHandle *FMMV, _FLOAT_ *x)
{
    int p = FMMV->pL;
    _FLOAT_ **Tz = FMMV->Tz_L2L;
    int k, dk, kk;

    if (FMMV->beta==0) {
        dk = p+1;
        kk = p+1;
        tpmv_upper(dk, Tz[0], x);
        for (k=1; k<=p; k++) {
            dk--;
            /* the same matrix Tz[k] is applied to two consecutive segments */
            tpmv_upper(dk, Tz[k], x + kk);
            kk += dk;
            tpmv_upper(dk, Tz[k], x + kk);
            kk += dk;
        }
    }
    else {
        /* scratch sized for the largest supported order */
        _FLOAT_ y[(FMM_P_MAX+1)*(FMM_P_MAX+2)];

        dk = p+1;
        kk = p+1;
        gemv(dk, dk, Tz[0], dk, x, y);
        for (k=1; k<=p; k++) {
            dk--;
            gemv(dk, dk, Tz[k], dk, x + kk, y + kk);
            kk += dk;
            gemv(dk, dk, Tz[k], dk, x + kk, y + kk);
            kk += dk;
        }
        /*
         * Only the first kk == (p+1)^2 entries of y were written above.
         * Copying (p+1)*(p+2) entries, as was done before, read p+1
         * uninitialised values from y and stored them past the transformed
         * part of x.  Copy back exactly what was computed.
         */
        memcpy(x, y, sizeof(_FLOAT_)*kk);
    }
}
/*
 * Block-diagonal matrix-vector product y = blkdiag(blocks) * x.
 *
 * The diagonal blocks are consumed in this order:
 *   blocks[0]                 1 x 1
 *   blocks[2*j], j = 1..p     (j+1) x (j+1)
 *   blocks[2*j+1], j = 1..p   j x j
 * Each block multiplies the matching contiguous slice of x and the product
 * is stored (through gemv) in the slice of y at the same offset.
 */
void Ry(int p, _FLOAT_ **blocks, _FLOAT_ *x, _FLOAT_ *y)
{
    int order;
    int offset = 1;   /* entry 0 is covered by the leading 1x1 block */

    gemv(1, 1, blocks[0], 1, x, y);

    for (order = 1; order <= p; ++order) {
        const int wide = order + 1;
        _FLOAT_ **pair = blocks + 2*order;   /* pair[0]: wide block, pair[1]: narrow block */

        gemv(wide, wide, pair[0], wide, x + offset, y + offset);
        offset += wide;

        gemv(order, order, pair[1], order, x + offset, y + offset);
        offset += order;
    }
}
void gemv( matrix_expression<MatA, cpu_tag> const &A, vector_expression<VectorX, cpu_tag> const &x, vector_expression<VectorY, cpu_tag> &y, typename VectorY::value_type alpha, boost::mpl::true_ ){ std::size_t m = A().size1(); std::size_t n = A().size2(); SIZE_CHECK(x().size() == A().size2()); SIZE_CHECK(y().size() == A().size1()); CBLAS_ORDER const stor_ord= (CBLAS_ORDER)storage_order<typename MatA::orientation>::value; auto storageA = A().raw_storage(); auto storagex = x().raw_storage(); auto storagey = y().raw_storage(); gemv(stor_ord, CblasNoTrans, (int)m, (int)n, alpha, storageA.values, storageA.leading_dimension, storagex.values, storagex.stride, typename VectorY::value_type(1), storagey.values, storagey.stride ); }
void gemv( matrix_expression<MatrA> const &A, vector_expression<VectorX> const &x, vector_expression<VectorY> &y, typename VectorY::value_type alpha, boost::mpl::true_ ){ std::size_t m = A().size1(); std::size_t n = A().size2(); SIZE_CHECK(x().size() == A().size2()); SIZE_CHECK(y().size() == A().size1()); CBLAS_ORDER const stor_ord= (CBLAS_ORDER)storage_order<typename MatrA::orientation>::value; gemv(stor_ord, CblasNoTrans, (int)m, (int)n, alpha, traits::storage(A), traits::leading_dimension(A), traits::storage(x), traits::stride(x), typename VectorY::value_type(1), traits::storage(y), traits::stride(y) ); }
/*
 * Matrix-matrix product assembled from k matrix-vector products.
 *
 * For every column index in [0, k), visited from the last column down to the
 * first, gemv is called with A, the start of that column in B (B + col,
 * stride k) and the start of that column in AB (AB + col, stride k).
 *
 * When AB is a null pointer a buffer of m*k scalars is obtained from
 * alloc_data and used instead.  The buffer that was written is returned.
 */
scalar* gemm(int m,int n,int k,scalar* A, scalar* B,scalar* AB){
    int col;

    if (!AB) {
        AB = alloc_data(m*k);
    }

    for (col = k; col-- != 0; ) {
        gemv(m, n, A, B + col, k, AB + col, k);   /* AB[*col] = A * B[*col] */
    }

    return AB;
}
void bi::mean(const M1 X, V1 mu) { /* pre-condition */ BI_ASSERT(X.size2() == mu.size()); const int N = X.size1(); typename sim_temp_vector<V1>::type w(N); set_elements(w, 1.0); gemv(1.0/N, X, w, 0.0, mu, 'T'); }
void bi::marginalise(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2,M2>& p2, const M3 C, const ExpGaussianPdf<V4, M4>& q2, ExpGaussianPdf<V5,M5>& p3) { /* pre-conditions */ BI_ASSERT(q2.size() == p2.size()); BI_ASSERT(p3.size() == p1.size()); BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size()); typename sim_temp_vector<V1>::type z2(p2.size()); typename sim_temp_matrix<M1>::type K(p1.size(), p2.size()); typename sim_temp_matrix<M1>::type A1(p2.size(), p2.size()); typename sim_temp_matrix<M1>::type A2(p2.size(), p2.size()); /** * Compute gain matrix: * * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f] */ symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U'); /** * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}', * \Sigma')\f$, where: * * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + * \mathcal{K}(\boldsymbol{\mu}_3 - \boldsymbol{\mu}_2)\,,\f] */ z2 = q2.mean(); axpy(-1.0, p2.mean(), z2); p3.mean() = p1.mean(); gemv(1.0, K, z2, 1.0, p3.mean()); /** * and: * * \f{eqnarray*} * \Sigma' &=& \Sigma_1 + \mathcal{K}(\Sigma_3 - * \Sigma_2)\mathcal{K}^T \\ * &=& \Sigma_1 + \mathcal{K}\Sigma_3\mathcal{K}^T - * \mathcal{K}\Sigma_2\mathcal{K}^T\,. * \f} */ p3.cov() = p1.cov(); A1 = K; trmm(1.0, q2.std(), A1, 'R', 'U', 'T'); syrk(1.0, A1, 1.0, p3.cov(), 'U'); A2 = K; trmm(1.0, p2.std(), A2, 'R', 'U', 'T'); syrk(-1.0, A2, 1.0, p3.cov(), 'U'); /* make sure correct log-variables set */ p3.setLogs(p2.getLogs()); p3.init(); // redo precalculations }
/**
 * Weighted mean over the rows of X, written to mu.
 *
 * The weights w are summed with sum_reduce; X, w, the scale 1/sum, a zero
 * scale for mu and the 'T' flag are then passed to gemv.
 *
 * Requires X.size2() == mu.size() and X.size1() == w.size().
 */
void bi::mean(const M1 X, const V1 w, V2 mu) {
  /* pre-conditions */
  BI_ASSERT(X.size2() == mu.size());
  BI_ASSERT(X.size1() == w.size());

  typedef typename V1::value_type weight_type;

  /* normalising constant: total weight */
  weight_type totalWeight = sum_reduce(w);

  gemv(1.0/totalWeight, X, w, 0.0, mu, 'T');
}
/// Checked matrix-vector product wrapper around gemv(y, 1, 0, A, x).
///
/// Returns false without touching y when the address of any argument
/// compares equal to NULL, or when the shapes disagree (A.col != x.size or
/// A.row != y.size).  Otherwise calls gemv and returns true.
///
/// NOTE(review): the previous comment described the operation as
/// "y = A*x + y"; the scalars 1 and 0 are forwarded to gemv unchanged, so
/// confirm against gemv which of them scales y.
/// NOTE(review): the arguments are references, so the address checks can
/// only fire if a caller dereferenced a null pointer beforehand.
bool mv(RealVector &y, const RealMatrix &A, const RealVector &x)
{
    const bool anyMissing = (NULL == &A || NULL == &y || NULL == &x);
    if (anyMissing) {
        return false;
    }

    const bool shapesAgree = !(A.col != x.size || A.row != y.size);
    if (!shapesAgree) {
        return false;
    }

    gemv(y, 1, 0, A, x);
    return true;
}
void bi::condition(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2, M2>& p2, const M3 C, const V3 x2, ExpGaussianPdf<V4, M4>& p3) { /* pre-condition */ BI_ASSERT(x2.size() == p2.size()); BI_ASSERT(p3.size() == p1.size()); BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size()); typename sim_temp_vector<V1>::type z2(p2.size()); typename sim_temp_matrix<M1>::type K(p1.size(), p2.size()); /** * Compute gain matrix: * * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f] */ symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U'); /** * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}', * \Sigma')\f$, where: * * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + \mathcal{K}(\mathbf{x}_2 - * \boldsymbol{\mu}_2)\,,\f] */ z2 = x2; log_vector(z2, p2.getLogs()); axpy(-1.0, p2.mean(), z2); p3.mean() = p1.mean(); gemv(1.0, K, z2, 1.0, p3.mean()); /** * and: * * \f{eqnarray*} * \Sigma' &=& \Sigma_1 - \mathcal{K}C_{\mathbf{x}_1,\mathbf{x}_2}^T \\ * &=& \Sigma_1 - C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1} * C_{\mathbf{x}_1,\mathbf{x}_2}^T\,.\f} */ K = C; trsm(1.0, p2.std(), K, 'R', 'U'); p3.cov() = p1.cov(); syrk(-1.0, K, 1.0, p3.cov(), 'U'); /* update log-variables and precalculations */ p3.setLogs(p1.getLogs()); p3.init(); }
template<class T> void gemv_check () { Matrix<T> A = rand<T> (4,4); Matrix<T> b = rand<T> (4,1); #ifdef VERBOSE std::cout << "A=\n" << A; std::cout << "b=\n" << b; #endif A = gemv(A,b); #ifdef VERBOSE std::cout << "A*b=\n" << A << std::endl; #endif }
/*
 * MPI-distributed bidiagonalisation sweep (the in-code note says it follows
 * the enhanced version from SLEPc; the name suggests a restarted
 * Golub-Kahan-Lanczos process -- NOTE(review): confirm).
 *
 * For j = l .. n-1 the routine fills columns j+1 of Q and (while j+1 < n) of
 * P, and writes D[j] and E[j].  Every rank applies its local operator pieces
 * (Ax / Atx) to the current column, places the result at its own offset
 * (s_indx / t_s_indx) inside a zero-filled full-length buffer, and the
 * buffers are gathered on rank 0, which assembles the column with
 * local_union, performs the orthogonalisation and scaling, and broadcasts the
 * updated data back to all ranks.
 *
 * Parameters (as used below):
 *   locked     first column index used in the Step 2 correction with rho.
 *   l          index of the first column to be generated.
 *   n          one past the last value of the loop index j.
 *   Ax, Atx    local operator pieces; wrapped with make_gemv_ax.
 *   D, E       receive D[j] and E[j] for every processed j (rank 0 only).
 *   rho        coefficients used in Step 2.
 *   P, Q       basis matrices updated column by column.
 *   s_indx     offset of this rank's Ax result inside a column of P.
 *   t_s_indx   offset of this rank's Atx result inside a column of Q.
 *
 * Side effects: MPI collectives on MPI_COMM_WORLD (all ranks must call this
 * together) and timing output on std::cout.  Timings are divided by 1.0e6
 * before printing -- presumably currenttime() returns microseconds.
 */
void bidiag_gkl_restart(
    int locked, int l, int n,
    CAX && Ax, CATX && Atx, CD && D, CE && E, CRho && rho, CP && P, CQ && Q, int s_indx, int t_s_indx)
{
    // enhancements version from SLEPc
    const double eta = 1.e-10;   // threshold of the Step 6 re-orthogonalisation test
    double t_start = 0.0, t_end = 0.0;
    double local_start = 0.0, local_end = 0.0;
    double t_total3 = 0.0, t_total4 = 0.0, t_total5 = 0.0, t_total6 = 0.0, t_total7 = 0.0;
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // Step 1
    int recv_len = (int)P.dim0() * nprocs;   // one full-length column per rank
    vec_container<double> tmp(Ax.dim0());
    vec_container<double> recv_tmp(recv_len);
    auto m_Ax = make_gemv_ax(&Ax);
    auto m_Atx = make_gemv_ax(&Atx);
    // presumably tmp = Ax * Q.col(l); the flag switches behaviour for large problems -- TODO confirm
    m_Ax(Q.col(l), tmp, P.dim0() > 1000);
    // scatter the local result into a zero-filled full-length buffer at this rank's offset
    vec_container<double> send_data(P.dim0(),0);
    for(size_t i = s_indx; i < s_indx + Ax.dim0(); ++i)
        send_data[i] = tmp.get(i-s_indx);
    MPI_Gather(&send_data[0], P.dim0(), MPI_DOUBLE, &recv_tmp[0], P.dim0(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    P.col(l) = 0;
    // Generate truly P.col(l)
    if(rank == 0) {
        local_union(P, recv_tmp, l, nprocs);
        // Step 2 & also in rank 0
        for (int j = locked; j < l; ++j) {
            P.col(l) += -rho(j) * P.col(j);
        }
    }
    // broadcasts P.size() doubles starting at the first element of column 0
    // (assumes the storage of P is contiguous -- TODO confirm)
    MPI_Bcast(&(P.col(0)[0]), P.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    //MPI_Bcast(&(P.col(l)[0]), P.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Main loop
    vec_container<double> T(n);
    int recv_l = Q.dim0() * nprocs;
    vec_container<double> recv_t(recv_l);
    for (int j = l; j < n; ++j) {
        // Step 3
        vec_container<double> tmp2(Atx.dim0());
        /* for print */
        if(rank == 0)
            t_start = currenttime();
        local_start = currenttime();
        // presumably tmp2 = Atx * P.col(j) -- TODO confirm
        m_Atx(P.col(j), tmp2, Q.dim0() > 1000);
        local_end = currenttime();
        std::cout << "parallel mv time cost is " << (local_end - local_start) / 1.0e6 << std::endl;
        // same scatter-into-zero-buffer scheme as in Step 1, now for a column of Q
        vec_container<double> s_data(Q.dim0(), 0);
        for(size_t i = t_s_indx; i < t_s_indx + Atx.dim0(); ++i)
            s_data[i] = tmp2[i-t_s_indx];
        MPI_Gather(&s_data[0], Q.dim0(), MPI_DOUBLE, &recv_t[0], Q.dim0(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        local_start = currenttime();
        std::cout << "parallel mv time cost2 is " << (local_start - local_end) / 1.0e6 << std::endl;
        //Q.col(j+1) = 0;
        if(rank == 0) {
            // Generate truly Q.col(j+1)
            local_union(Q, recv_t, j + 1, nprocs);
            local_end = currenttime();
            t_end = currenttime();
            std::cout << "parallel mv time cost3 is " << (local_end - local_start) / 1.0e6 << std::endl;
            std::cout << "time of step 3 is : " << (t_end - t_start) / 1.0e6 << std::endl;
            t_total3 += (t_end - t_start) / 1.0e6;
        }

        // Step 4
        // send the first j+2 entries of every row of Q from rank 0 to all ranks
        for(size_t aa = 0; aa < Q.dim0(); ++aa) // row
            MPI_Bcast(&(Q.row(aa)[0]), j + 2, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        // MPI_Bcast(&(Q.col(0)[0]), Q.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        if(rank == 0)
            t_end = currenttime();
        auto Qj = mat_cols(Q, 0, j + 1);
        auto Tj = make_vec(&T, j + 1);
        //Tj.assign(gemv(Qj.trans(), Q.col(j + 1)), j >= 3);
        // replaces the serial assignment above; presumably Tj = Qj^T * Q.col(j+1) -- TODO confirm
        parallel_gemv_task(Qj.trans(), Q.col(j+1), Tj);
        if(rank == 0) {
            t_start = currenttime();
            t_total4 += (t_start - t_end) / 1.0e6;
            std::cout << "time of step 4 is : " << (t_start - t_end) / 1.0e6 << std::endl;
        }

        // Step 5
        if(rank == 0) {
            double r = Q.col(j + 1).norm2();
            // vec_unit's return value is stored as D[j] and used as the scale below
            // (presumably it normalises P.col(j) and returns its former norm -- TODO confirm)
            D[j] = vec_unit(P.col(j));
            Q.col(j + 1).scale(1. / D[j]);
            Tj = Tj / D[j];
            r /= D[j];
            Q.col(j + 1).plus_assign(- gemv(Qj, Tj), Q.dim0() > 1000);
            t_end = currenttime();
            t_total5 += (t_end - t_start) / 1.0e6;
            std::cout << "time of step 5 is : " << (t_end - t_start) / 1.0e6 << std::endl;

            // Step 6
            double beta = r * r - Tj.square_sum();
            if (beta < eta * r * r) {
                // second orthogonalisation pass
                Tj.assign(gemv(Qj.trans(), Q.col(j + 1)), Q.dim0() > 1000);
                // NOTE(review): r was a norm2() value above but is a square_sum() here,
                // and it is squared again in the next statement -- confirm intended
                r = Q.col(j + 1).square_sum();
                Q.col(j + 1).plus_assign(-gemv(Qj, Tj), Q.dim0() > 1000);
                beta = r * r - Tj.square_sum();
            }
            beta = std::sqrt(beta);
            E[j] = beta;
            Q.col(j + 1).scale(1. / E[j]);
            t_start = currenttime();
            t_total6 += (t_start - t_end) / 1.0e6;
            std::cout << "time of step 6 is : " << (t_start - t_end) / 1.0e6 << std::endl;
        }

        // Step 7
        // MPI_Bcast(&(Q.col(j+1)[0]), Q.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        // MPI_Bcast(&(Q.col(0)[0]), Q.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
        // the finished column j+1 of Q is sent one element per broadcast
        for(size_t aa = 0; aa < Q.dim0(); ++aa)
            MPI_Bcast(&(Q.col(j+1)[aa]), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        if (j + 1 < n) {
            if(rank == 0)
                t_start = currenttime();
            vec_container<double> tmp3(Ax.dim0());
            vec_container<double> se_data(P.dim0(), 0);
            m_Ax(Q.col(j + 1), tmp3, P.dim0() > 1000);
            for(size_t k1 = s_indx; k1 < s_indx + Ax.dim0(); ++k1)
                se_data[k1] = tmp3[k1-s_indx];
            MPI_Gather(&se_data[0], P.dim0(), MPI_DOUBLE, &recv_tmp[0], P.dim0(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
            // P.col(j+1) = 0;
            if(rank == 0) {
                local_union(P, recv_tmp, j + 1, nprocs);
                P.col(j + 1).plus_assign(- E[j] * P.col(j), P.dim0() > 1000);
            }
            /* for print */
            if(rank == 0) {
                t_end = currenttime();
                t_total7 += (t_end - t_start) / 1.0e6;
                std::cout << "time of step 7 is : " << (t_end - t_start) / 1.0e6 << std::endl;
            }
            // MPI_Bcast(&(P.col(l)[0]), P.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
            // MPI_Bcast(&(P.col(0)[0]), P.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
            // the new column j+1 of P is sent one element per broadcast
            for(size_t aa = 0; aa < P.dim0(); ++aa)
                MPI_Bcast(&(P.col(j+1)[aa]), 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        } // end if
    } // end while

    /* for print time of each step. */
    if(rank == 0) {
        std::cout << "total step 3 time is : " << t_total3 << std::endl;
        std::cout << "total step 4 time is : " << t_total4 << std::endl;
        std::cout << "total step 5 time is : " << t_total5 << std::endl;
        std::cout << "total step 6 time is : " << t_total6 << std::endl;
        std::cout << "total step 7 time is : " << t_total7 << std::endl;
    }
    return ;
}
/*
 * Blocked triangular matrix-vector multiply built from a small triangular
 * kernel (trmvK) for the nb x nb diagonal blocks and a transposed GEMV kernel
 * for the off-diagonal panels.  (The "LT" suffix and the gemvT kernels
 * suggest the Lower / Transpose case -- NOTE(review): confirm.)
 *
 * The input vector is copied into an aligned work vector x and the result is
 * accumulated in y, which starts out as zero.  If X itself is cache-line
 * aligned with unit stride it doubles as y; otherwise both x and y live in
 * one malloc'd buffer and y is copied back to X at the end.
 *
 * Arguments:
 *   Diag  AtlasNonUnit selects ATL_trmvLTNk, anything else ATL_trmvLTUk.
 *   nb    blocking factor.
 *   N     order of the matrix.
 *   A     matrix, leading dimension lda.
 *   X     vector, stride incX; overwritten with the result on success.
 *
 * Return codes: 0 on success, 1 when N < 2*nb (nothing done), 2 or 3 when
 * the workspace allocation fails (X has not been modified at that point).
 */
static int ATL_trmvLT
(
   const enum ATLAS_DIAG Diag,
   const int nb,
   ATL_CINT N,
   const TYPE *A,
   ATL_CINT lda,
   TYPE *X,
   ATL_CINT incX
)
/*
 * RETURNS: 0 if TRMV was performed, non-zero if nothing done
 */
{
   static void (*trmvK)(ATL_CINT, const TYPE*, ATL_CINT, const TYPE*, TYPE*);
   void (*gemv)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT, const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
   void *vp;          /* owning pointer of the workspace; freed before returning 0 */
   TYPE *x, *y;       /* x: aligned copy of the input, y: accumulated result */
   /* footprint estimate (matrix plus two vectors) used to pick a GEMV kernel */
   const size_t opsize = (N*N+N+N)*sizeof(TYPE)SHIFT;
   size_t t0;
   /* for TCPLX the counts are doubled, presumably because each element
    * occupies two TYPE slots -- TODO confirm.
    * NOTE: N2, lda2 and one are not #undef'd within this function. */
   #ifdef TCPLX
      size_t N2=N+N, lda2 = lda+lda;
      TYPE one[2] = {ATL_rone, ATL_rzero};
   #else
      #define N2 N
      #define lda2 lda
      #define one ATL_rone
   #endif
   /* pointer increment that moves A from one diagonal block to the next */
   const size_t incA = ((size_t)lda+1)*(nb SHIFT);
   /* Nnb: part handled by the blocked loop; Nr: size of the final block */
   ATL_CINT Nnb = ((N-1)/nb)*nb, Nr = N-Nnb;
   ATL_INT j;

   /* fewer than two blocks: report "nothing done" */
   if (N < nb+nb) return(1);
   /* choose the GEMV variant according to the footprint estimate */
   if (opsize > MY_CE) gemv = Mjoin(PATL,gemvT);
   else gemv = (opsize <= ATL_MulBySize(ATL_L1elts)) ? Mjoin(PATL,gemvT_L1) : Mjoin(PATL,gemvT_L2);
   trmvK = (Diag == AtlasNonUnit) ? ATL_trmvLTNk : ATL_trmvLTUk;
/*
 * If X is aligned to Cachelen wt inc=1, use it as y
 */
   t0 = (size_t) X;
   if (incX == 1 && (ATL_MulByCachelen(ATL_DivByCachelen(t0)) == t0))
   {
      ATL_INT i;
      vp = malloc(ATL_Cachelen+ATL_MulBySize(N));
      if (!vp) return(2);
      x = ATL_AlignPtr(vp);
      y = X;
      /* move the input into x and clear X so that it can accumulate the result */
      for (i=0; i < N2; i++)
      {
         x[i] = X[i];
         X[i] = ATL_rzero;
      }
   }
   else /* allocate both X and Y */
   {
      vp = malloc((ATL_Cachelen+ATL_MulBySize(N))<<1);
      if (!vp) return(3);
      x = ATL_AlignPtr(vp);
      y = x + N2;
      y = ATL_AlignPtr(y);
      Mjoin(PATL,copy)(N, X, incX, x, 1);
      Mjoin(PATL,zero)(N, y, 1);
   }
   /* one iteration per full nb-sized diagonal block */
   for (j=0; j < Nnb; j += nb, A += incA)
   {
      #ifdef TCPLX
         const register size_t j2=j+j, nb2=nb+nb;
      #else
         #define j2 j
         #define nb2 nb
      #endif
      /* diagonal block */
      trmvK(nb, A, lda, x+j2, y+j2);
      /* panel starting nb rows below the diagonal block: (N-j-nb) x nb */
      gemv(N-j-nb, nb, one, A+nb2, lda, x+j2+nb2, 1, one, y+j2, 1);
      #ifndef TCPLX
         #undef j2
         #undef nb2
      #endif
   }
   #ifdef TCPLX
      j += j;
   #endif
   /* final diagonal block of size Nr */
   trmvK(Nr, A, lda, x+j, y+j);
   /* copy the result back unless X itself served as y */
   if (y != X) Mjoin(PATL,copy)(N, y, 1, X, incX);
   free(vp);
   return(0);
}
/*
 * Local search started from the trial point T.
 *
 * With the "#if 0" branch disabled, the active code is a trust-region
 * iteration that combines a steepest-descent step with a quasi-Newton step
 * (dogleg style: pure Newton, scaled steepest descent, or a mix of both) and
 * maintains an approximate Hessian B and its inverse H with BFGS updates.
 *
 * Parameters:
 *   T       in: starting point (T.xvals); out: final point and objective.
 *   box     box being searched; also queried for previously found minima.
 *   domain  enclosing domain, used for the inside/outside tests.
 *   eps_cl  tolerance passed to box.CloseToMin.
 *   mgr     in/out: running maximum of the gradient infinity norm.
 *   glob    provides ObjectiveGradient.
 *   axis    -1 for the full-dimensional search; otherwise the search is
 *           one-dimensional along coordinate `axis` of x_av.
 *   x_av    work point used when axis != -1.
 *   stop    (only when NLOPT_UTIL_H is defined) stopping data; referenced
 *           directly only by the disabled branch.
 *
 * Returns one of the LS_* codes: LS_Old (near a known stationary point),
 * LS_New, LS_MaxIter, LS_Out (iterate outside the box/domain).
 * The process is terminated with exit(1) if the starting point is outside
 * the boundary or if the iteration becomes unstable.
 */
int local(Trial &T, TBox &box, TBox &domain, double eps_cl, double *mgr,
          Global &glob, int axis, RCRVector x_av
#ifdef NLOPT_UTIL_H
          , nlopt_stopping *stop
#endif
          ) {

  int n=box.GetDim();
  RVector x(n);
  double tmp, f;

  x=T.xvals ;

#ifdef LS_DEBUG
  cout << "Local Search, x=" << x << endl;
#endif

  if (box.OutsideBox(x, domain) != 0) {
    cout << "Starting point is not inside the boundary. Exiting...\n" ;
    exit(1) ;
    return LS_Out ;   // not reached: exit(1) above terminates the process
  }

  // Check if we are close to a stationary point located previously
  if (box.CloseToMin(x, &tmp, eps_cl)) {
#ifdef LS_DEBUG
    cout << "Close to a previously located stationary point, exiting" << endl;
#endif
    T.objval=tmp;
    return LS_Old ;
  }

#if 0

  // Disabled alternative: delegate to NLopt's L-BFGS local optimizer.
  if (axis != -1) {
    cout << "NLopt code only works with axis == -1, exiting...\n" ;
    exit(EXIT_FAILURE);
  }
  f_local_data data;
  data.glob = &glob;
  data.maxgrad = *mgr;
  data.stop = stop;
  nlopt_result ret = nlopt_minimize(NLOPT_LOCAL_LBFGS, n, f_local, &data,
                                    box.lb.raw_data(), box.ub.raw_data(),
                                    x.raw_data(), &f,
                                    stop->minf_max,
                                    stop->ftol_rel, stop->ftol_abs,
                                    stop->xtol_rel, stop->xtol_abs,
                                    stop->maxeval - stop->nevals,
                                    stop->maxtime - stop->start);
  *mgr = data.maxgrad;
  T.xvals=x ;
  T.objval=f ;
  if (ret == NLOPT_MAXEVAL_REACHED || ret == NLOPT_MAXTIME_REACHED)
    return LS_MaxEvalTime;
  else if (ret > 0)
    return LS_New;
  else
    return LS_Out; // failure

#else /* not using NLopt local optimizer ... use original STOgo BFGS code */

  int k_max, info, outside = 0;
  int k, i, good_enough, iTmp ;

  double maxgrad, delta, f_new;
  double alpha, gamma, beta, d2, s2, nom, den, ro ;
  double nrm_sd, nrm_hn, snrm_hn, nrm_dl ;
  RVector g(n), h_sd(n), h_dl(n), h_n(n), x_new(n), g_new(n) ;
  RVector s(n),y(n),z(n),w(n) ; // Temporary vectors
  RMatrix B(n), H(n) ;          // Hessian and its inverse

  k_max = max_iter*n ;

  // Initially B and H are equal to the identity matrix
  B=0 ; H=0 ;
  for (i=0 ; i<n ; i++) {
    B(i,i)=1 ;
    H(i,i)=1 ;
  }

  // Objective and gradient at the starting point.  In the 1-D case the point
  // is embedded in x_av and only component `axis` of the gradient is kept.
  RVector g_av(x_av.GetLength());
  if (axis==-1) {
    f=glob.ObjectiveGradient(x,g,OBJECTIVE_AND_GRADIENT);
  }
  else {
    x_av(axis)=x(0);
    f=glob.ObjectiveGradient(x_av,g_av,OBJECTIVE_AND_GRADIENT);
    g(0)=g_av(axis);
  }
  IF_NLOPT_CHECK_EVALS;
  FC++;GC++;

  // Initial trust-region radius delta; the scheme is chosen at compile time.
  if (axis == -1) {
    // Skipping AV
#ifdef INI3
    // Elaborate scheme to initialize delta
    delta=delta_coef*norm2(g) ;
    copy(g,z) ;
    axpy(1.0,x,z) ;
    if (!box.InsideBox(z)) {
      if (box.Intersection(x,g,z)==TRUE) {
        axpy(-1.0,x,z) ;
        delta=min(delta,delta_coef*norm2(z)) ;
      }
      else {
        // Algorithm broke down, use INI1
        delta = (1.0/7)*box.ShortestSide(&iTmp) ;
      }
    }
#endif
#ifdef INI2
    // Use INI2 scheme
    delta = box.ClosestSide(x)*delta_coef ;
    if (delta<MacEpsilon)
      // Patch to avoid trust region with radius close to zero
      delta = (1.0/7)*box.ShortestSide(&iTmp) ;
#endif
#ifdef INI1
    delta = delta_coef*box.ShortestSide(&iTmp) ;
#endif
  }
  else {
    // Use a simple scheme for the 1D minimization (INI1)
    delta = (1.0/7.0)*box.ShortestSide(&iTmp) ;
  }

  k=0 ; good_enough = 0 ; info=LS_New ; outside=0 ;
  maxgrad=*mgr ;
  while (good_enough == 0) {
    k++ ;
    if (k>k_max) {
#ifdef LS_DEBUG
      cout << "Maximum number of iterations reached\n" ;
#endif
      info=LS_MaxIter ;
      break ;
    }

    // Update maximal gradient value
    maxgrad=max(maxgrad,normInf(g)) ;

    // Steepest descent, h_sd = -g
    copy(g,h_sd) ;
    scal(-1.0,h_sd) ;
    nrm_sd=norm2(h_sd) ;

    if (nrm_sd < epsilon) {
      // Stop criterion (gradient) fulfilled
#ifdef LS_DEBUG
      cout << "Gradient small enough" << endl ;
#endif
      good_enough = 1 ;
      break ;
    }

    // Compute Newton step, h_n = -H*g
    gemv('N',-1.0, H, g, 0.0, h_n) ;
    nrm_hn = norm2(h_n) ;

    if (nrm_hn < delta) {
      // Pure Newton step
      copy(h_n, h_dl) ;
#ifdef LS_DEBUG
      cout << "[Newton step] " ;
#endif
    }
    else {
      // z = B*g, tmp = g'*B*g
      gemv('N',1.0,B,g,0.0,z) ;
      tmp=dot(g,z) ;
      if (tmp==0) {
        info = LS_Unstable ;
        break ;
      }
      alpha=(nrm_sd*nrm_sd)/tmp ; // Normalization (N38,eq. 3.30)
      scal(alpha,h_sd) ;
      nrm_sd=fabs(alpha)*nrm_sd ;

      if (nrm_sd >= delta) {
        gamma = delta/nrm_sd ; // Normalization (N38, eq. 3.33)
        copy(h_sd,h_dl) ;
        scal(gamma,h_dl) ;
#ifdef LS_DEBUG
        cout << "[Steepest descent] " ;
#endif
      }
      else {
        // Combination of Newton and SD steps
        d2 = delta*delta ;
        copy(h_sd,s) ;
        s2=nrm_sd*nrm_sd ;
        nom = d2 - s2 ;
        snrm_hn=nrm_hn*nrm_hn ;
        tmp = dot(h_n,s) ;
        den = tmp-s2 + sqrt((tmp-d2)*(tmp-d2)+(snrm_hn-d2)*(d2-s2)) ;
        if (den==0) {
          info = LS_Unstable ;
          break ;
        }
        // Normalization (N38, eq. 3.31)
        beta = nom/den ;
        copy(h_n,h_dl) ;
        scal(beta,h_dl) ;
        axpy((1-beta),h_sd,h_dl) ;
#ifdef LS_DEBUG
        cout << "[Mixed step] " ;
#endif
      }
    }
    nrm_dl=norm2(h_dl) ;

    //x_new = x+h_dl ;
    copy(x,x_new) ;
    axpy(1.0,h_dl,x_new) ;

    // Check if x_new is inside the box
    // (as handled below: 1 = outside the box, 2 = outside the domain)
    iTmp=box.OutsideBox(x_new, domain) ;
    if (iTmp == 1) {
#ifdef LS_DEBUG
      cout << "x_new is outside the box " << endl ;
#endif
      outside++ ;
      if (outside>max_outside_steps) {
        // Previous point was also outside, exit
        break ;
      }
    }
    else if (iTmp == 2) {
#ifdef LS_DEBUG
      cout << " x_new is outside the domain" << endl ;
#endif
      info=LS_Out ;
      break ;
    }
    else {
      outside=0 ;
    }

    // Compute the gain
    if (axis==-1)
      f_new=glob.ObjectiveGradient(x_new,g_new,OBJECTIVE_AND_GRADIENT);
    else {
      x_av(axis)=x_new(0);
      f_new=glob.ObjectiveGradient(x_av,g_av,OBJECTIVE_AND_GRADIENT);
    }
    IF_NLOPT_CHECK_EVALS;
    FC++; GC++;
    // ro = actual change / change predicted by the quadratic model
    gemv('N',0.5,B,h_dl,0.0,z);
    ro = (f_new-f) / (dot(g,h_dl) + dot(h_dl,z)); // Quadratic model
    // grow the radius after a good prediction, shrink it after a poor one
    if (ro > 0.75) {
      delta = delta*2;
    }
    if (ro < 0.25) {
      delta = delta/3;
    }
    if (ro > 0) {
      // Update the Hessian and its inverse using the BFGS formula
#if 0 // changed by SGJ to compute OBJECTIVE_AND_GRADIENT above
      if (axis==-1)
        glob.ObjectiveGradient(x_new,g_new,GRADIENT_ONLY);
      else {
        x_av(axis)=x_new(0);
        glob.ObjectiveGradient(x_av,g_av,GRADIENT_ONLY);
        g_new(0)=g_av(axis);
      }
      GC++;
      IF_NLOPT_CHECK_EVALS;
#else
      if (axis != -1)
        g_new(0)=g_av(axis);
#endif

      // y=g_new-g
      copy(g_new,y);
      axpy(-1.0,g,y);

      // Check curvature condition
      alpha=dot(y,h_dl);
      if (alpha <= sqrt(MacEpsilon)*nrm_dl*norm2(y)) {
        // B and H are left unchanged in this case
#ifdef LS_DEBUG
        cout << "Curvature condition violated " ;
#endif
      }
      else {
        // Update Hessian
        gemv('N',1.0,B,h_dl,0.0,z) ; // z=Bh_dl
        beta=-1/dot(h_dl,z) ;
        ger(1/alpha,y,y,B) ;
        ger(beta,z,z,B) ;

        // Update Hessian inverse
        gemv('N',1.0,H,y,0.0,z) ; // z=H*y
        gemv('T',1.0,H,y,0.0,w) ; // w=y'*H
        beta=dot(y,z) ;
        beta=(1+beta/alpha)/alpha ;

        // It should be possible to do this updating more efficiently, by
        // exploiting the fact that (h_dl*y'*H) = transpose(H*y*h_dl')
        ger(beta,h_dl,h_dl,H) ;
        ger(-1/alpha,z,h_dl,H) ;
        ger(-1/alpha,h_dl,w,H) ;
      }

      if (nrm_dl < norm2(x)*epsilon) {
        // Stop criterion (iteration progress) fulfilled
#ifdef LS_DEBUG
        cout << "Progress is marginal" ;
#endif
        good_enough = 1 ;
      }

      // Check if we are close to a stationary point located previously
      if (box.CloseToMin(x_new, &f_new, eps_cl)) {
        // Note that x_new and f_new may be overwritten on exit from CloseToMin
#ifdef LS_DEBUG
        cout << "Close to a previously located stationary point, exiting" << endl;
#endif
        info = LS_Old ;
        good_enough = 1 ;
      }

      // Update x, g and f
      copy(x_new,x) ;
      copy(g_new,g) ;
      f=f_new ;

#ifdef LS_DEBUG
      cout << " x=" << x << endl ;
#endif

    }
    else {
      // step rejected: x, g and f keep their values; only delta was adjusted
#ifdef LS_DEBUG
      cout << "Step is no good, ro=" << ro << " delta=" << delta << endl ;
#endif
    }

  } // wend

  // Make sure the routine returns correctly...
  // Check if last iterate is outside the boundary
  if (box.OutsideBox(x, domain) != 0) {
    info=LS_Out;
    f=DBL_MAX;
  }

  if (info == LS_Unstable) {
    cout << "Local search became unstable. No big deal but exiting anyway\n" ;
    exit(1);
  }

  // hand the results back to the caller
  *mgr=maxgrad ;

  T.xvals=x ;
  T.objval=f ;
  // a pending count of outside steps overrides whatever info says
  if (outside>0)
    return LS_Out ;
  else
    return info ;

#endif
}
/**
 * Batched matrix product out = op(lhs) * op(rhs) on the OpenCL BLAS backend.
 *
 * Dimensions 0 and 1 of the operands hold the matrices; dimensions 2 and 3
 * are batch dimensions.  The output batch extent along each of them is the
 * larger of the two operands' extents, and an operand whose extent differs
 * from the output's is reused for every batch entry along that dimension
 * (its stride is multiplied by zero).
 *
 * When the right-hand side has a single column the GEMV kernel is used,
 * otherwise GEMM.  When built WITH_LINEAR_ALGEBRA and OpenCLCPUOffload(false)
 * reports true, the call is forwarded to cpu::matmul instead.
 *
 * @param lhs     left operand
 * @param rhs     right operand
 * @param optLhs  transpose option for lhs
 * @param optRhs  transpose option for rhs
 * @return newly created array of dimensions (M, N, d2, d3)
 */
Array<T> matmul(const Array<T> &lhs, const Array<T> &rhs,
                af_mat_prop optLhs, af_mat_prop optRhs)
{
#if defined(WITH_LINEAR_ALGEBRA)
    if(OpenCLCPUOffload(false)) {   // Do not force offload gemm on OSX Intel devices
        return cpu::matmul(lhs, rhs, optLhs, optRhs);
    }
#endif
    const auto lOpts = toBlasTranspose(optLhs);
    const auto rOpts = toBlasTranspose(optRhs);

    // which stored dimension acts as rows / columns after the optional transpose
    const auto aRowDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 0 : 1;
    const auto aColDim = (lOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;
    const auto bColDim = (rOpts == OPENCL_BLAS_NO_TRANS) ? 1 : 0;

    const dim4 lDims = lhs.dims();
    const dim4 rDims = rhs.dims();
    const int M = lDims[aRowDim];
    const int N = rDims[bColDim];
    const int K = lDims[aColDim];

    dim_t d2 = std::max(lDims[2], rDims[2]);
    dim_t d3 = std::max(lDims[3], rDims[3]);
    dim4 oDims = af::dim4(M, N, d2, d3);
    Array<T> out = createEmptyArray<T>(oDims);

    const auto alpha = scalar<T>(1);
    const auto beta  = scalar<T>(0);

    const dim4 lStrides = lhs.strides();
    const dim4 rStrides = rhs.strides();
    const dim4 oStrides = out.strides();

    int batchSize = oDims[2] * oDims[3];

    // an operand is "batched" along a dimension when it has the output's extent there
    bool is_l_d2_batched = oDims[2] == lDims[2];
    bool is_l_d3_batched = oDims[3] == lDims[3];
    bool is_r_d2_batched = oDims[2] == rDims[2];
    bool is_r_d3_batched = oDims[3] == rDims[3];

    for (int n = 0; n < batchSize; n++) {
        // Split the flat batch index over the OUTPUT batch shape.  Splitting
        // by rDims[2], as was done before, yields z == 0 and w == n whenever
        // lhs is batched along dimension 2 but rhs is not, which addresses
        // the wrong lhs slice and runs past the output along dimension 3.
        int w = n / oDims[2];
        int z = n - w * oDims[2];

        int loff = z * (is_l_d2_batched * lStrides[2]) + w * (is_l_d3_batched * lStrides[3]);
        int roff = z * (is_r_d2_batched * rStrides[2]) + w * (is_r_d3_batched * rStrides[3]);

        dim_t lOffset = lhs.getOffset() + loff;
        dim_t rOffset = rhs.getOffset() + roff;
        dim_t oOffset = out.getOffset() + z * oStrides[2] + w * oStrides[3];

        cl::Event event;
        if(rDims[bColDim] == 1) {
            // single right-hand-side column: matrix-vector kernel
            dim_t incr = (optRhs == AF_MAT_NONE) ? rStrides[0] : rStrides[1];
            gpu_blas_gemv_func<T> gemv;
            OPENCL_BLAS_CHECK(
                gemv(lOpts, lDims[0], lDims[1],
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, incr,
                     beta,
                     (*out.get())(), oOffset, 1,
                     1, &getQueue()(), 0, nullptr, &event())
                );
        } else {
            gpu_blas_gemm_func<T> gemm;
            OPENCL_BLAS_CHECK(
                gemm(lOpts, rOpts, M, N, K,
                     alpha,
                     (*lhs.get())(), lOffset, lStrides[1],
                     (*rhs.get())(), rOffset, rStrides[1],
                     beta,
                     (*out.get())(), oOffset, out.dims()[0],
                     1, &getQueue()(), 0, nullptr, &event())
                );
        }
    }

    return out;
}
//
// s-step BiCGStab solve (per the routine name and its log prefix this is the
// communication-avoiding "CABiCGStab" variant).
//
// Each outer iteration builds the monomial-basis powers of p and r with
// Lp.apply(), forms one Gram-like matrix with BuildGramMatrix(), and then runs
// up to SSS inner BiCGStab steps purely on short coefficient vectors
// (aj, cj, ej) of length 4*SSS+1.  sol, p and r are rebuilt from those
// coefficients at the end of the outer iteration.
//
// Arguments:
//   sol      in: initial guess; out: updated solution (one component).
//   rhs      right-hand side.
//   eps_rel  relative tolerance, compared against the initial L2 norm.
//   eps_abs  absolute tolerance, only tested against the initial residual.
//   bc_mode  boundary-condition mode for the initial residual; the applies
//            inside the iteration use LinOp::Homogeneous_BC.
//
// Return value:
//   0     converged, or nothing to do (zero / small initial residual)
//   1     g_dot_Tpaj == 0
//   2     alpha == inf
//   3     omega_denominator == 0
//   4     omega == 0 or inf
//   5     delta == 0 or inf
//   6     beta == 0 or inf
//   7, 8  iteration budget exhausted (8 when the residual norm grew)
//
int
CGSolver::solve_cabicgstab (MultiFab&       sol,
                            const MultiFab& rhs,
                            Real            eps_rel,
                            Real            eps_abs,
                            LinOp::BC_Mode  bc_mode)
{
    BL_PROFILE("CGSolver::solve_cabicgstab()");

    BL_ASSERT(sol.nComp() == 1);
    BL_ASSERT(sol.boxArray() == Lp.boxArray(lev));
    BL_ASSERT(rhs.boxArray() == Lp.boxArray(lev));

    // All work arrays are sized for the largest step count SSS_MAX; only the
    // leading 4*SSS+1 entries are in use for the current SSS.
    Real  temp1[4*SSS_MAX+1];
    Real  temp2[4*SSS_MAX+1];
    Real  temp3[4*SSS_MAX+1];
    Real     Tp[4*SSS_MAX+1][4*SSS_MAX+1];
    Real    Tpp[4*SSS_MAX+1][4*SSS_MAX+1];
    Real     aj[4*SSS_MAX+1];
    Real     cj[4*SSS_MAX+1];
    Real     ej[4*SSS_MAX+1];
    Real   Tpaj[4*SSS_MAX+1];
    Real   Tpcj[4*SSS_MAX+1];
    Real  Tppaj[4*SSS_MAX+1];
    Real      G[4*SSS_MAX+1][4*SSS_MAX+1];    // Extracted from first 4*SSS+1 columns of Gg[][].  indexed as [row][col]
    Real      g[4*SSS_MAX+1];                 // Extracted from last [4*SSS+1] column of Gg[][].
    Real     Gg[(4*SSS_MAX+1)*(4*SSS_MAX+2)]; // Buffer to hold the Gram-like matrix produced by matmul().  indexed as [row*(4*SSS+2) + col]
    //
    // If variable_SSS we "telescope" SSS.
    // We start with 1 and increase it up to SSS_MAX on the outer iterations.
    //
    if (variable_SSS) SSS = 1;

    zero(   aj, 4*SSS_MAX+1);
    zero(   cj, 4*SSS_MAX+1);
    zero(   ej, 4*SSS_MAX+1);
    zero( Tpaj, 4*SSS_MAX+1);
    zero( Tpcj, 4*SSS_MAX+1);
    zero(Tppaj, 4*SSS_MAX+1);
    zero(temp1, 4*SSS_MAX+1);
    zero(temp2, 4*SSS_MAX+1);
    zero(temp3, 4*SSS_MAX+1);

    SetMonomialBasis(Tp,Tpp,SSS);

    const int ncomp = 1, nghost = sol.nGrow();
    //
    // Contains the matrix powers of p[] and r[].
    //
    // First 2*SSS+1 components are powers of p[].
    // Next  2*SSS   components are powers of r[].
    //
    const BoxArray& ba = sol.boxArray();
    const DistributionMapping& dm = sol.DistributionMap();
    MultiFab PR(ba, 4*SSS_MAX+1, 0, dm);

    MultiFab  p(ba, ncomp, 0, dm);
    MultiFab  r(ba, ncomp, 0, dm);
    MultiFab rt(ba, ncomp, 0, dm);

    MultiFab tmp(ba, 4, nghost, dm);

    Lp.residual(r, rhs, sol, lev, bc_mode);

    BL_ASSERT(!r.contains_nan());

    // rt and p both start as copies of the initial residual
    MultiFab::Copy(rt,r,0,0,1,0);
    MultiFab::Copy( p,r,0,0,1,0);

    const Real           rnorm0        = norm_inf(r);
    Real                 delta         = dotxy(r,rt);
    const Real           L2_norm_of_rt = sqrt(delta);
    const LinOp::BC_Mode temp_bc_mode  = LinOp::Homogeneous_BC;

    if ( verbose > 0 && ParallelDescriptor::IOProcessor(color()) )
    {
        Spacer(std::cout, lev);
        std::cout << "CGSolver_CABiCGStab: Initial error (error0) = " << rnorm0 << '\n';
    }

    // Nothing to do when the initial residual is already zero or below eps_abs.
    if ( rnorm0 == 0 || delta == 0 || rnorm0 < eps_abs )
    {
        if ( verbose > 0 && ParallelDescriptor::IOProcessor(color()) )
        {
            Spacer(std::cout, lev);
            std::cout << "CGSolver_CABiCGStab: niter = 0,"
                      << ", rnorm = " << rnorm0
                      << ", delta = " << delta
                      << ", eps_abs = " << eps_abs << '\n';
        }
        return 0;
    }

    int niters = 0, ret = 0;

    // atime / gtime accumulate the time spent in the applies and in the Gram matrix
    Real L2_norm_of_resid = 0, atime = 0, gtime = 0;

    bool BiCGStabFailed = false, BiCGStabConverged = false;

    // m counts completed inner steps; it is advanced by SSS at the end of
    // each outer iteration that neither failed nor converged.
    for (int m = 0; m < maxiter && !BiCGStabFailed && !BiCGStabConverged; )
    {
        const Real time1 = ParallelDescriptor::second();
        //
        // Compute the matrix powers on p[] & r[] (monomial basis).
        // The 2*SSS+1 powers of p[] followed by the 2*SSS powers of r[].
        //
        MultiFab::Copy(PR,p,0,0,1,0);
        MultiFab::Copy(PR,r,0,2*SSS+1,1,0);

        BL_ASSERT(!PR.contains_nan(0,      1));
        BL_ASSERT(!PR.contains_nan(2*SSS+1,1));
        //
        // We use "tmp" to minimize the number of Lp.apply()s.
        // We do this by doing p & r together in a single call.
        //
        MultiFab::Copy(tmp,p,0,0,1,0);
        MultiFab::Copy(tmp,r,0,1,1,0);

        for (int n = 1; n < 2*SSS; n++)
        {
            // apply to components 0-1 of tmp, results in components 2-3;
            // they are then moved back to 0-1 and stored in PR
            Lp.apply(tmp, tmp, lev, temp_bc_mode, false, 0, 2, 2);

            MultiFab::Copy(tmp,tmp,2,0,2,0);

            MultiFab::Copy(PR,tmp,0,        n,1,0);
            MultiFab::Copy(PR,tmp,1,2*SSS+n+1,1,0);

            BL_ASSERT(!PR.contains_nan(n,        1));
            BL_ASSERT(!PR.contains_nan(2*SSS+n+1,1));
        }

        // one extra apply for the highest power of p (component 2*SSS of PR)
        MultiFab::Copy(tmp,PR,2*SSS-1,0,1,0);
        Lp.apply(tmp, tmp, lev, temp_bc_mode, false, 0, 1, 1);
        MultiFab::Copy(PR,tmp,1,2*SSS,1,0);

        BL_ASSERT(!PR.contains_nan(2*SSS-1,1));
        BL_ASSERT(!PR.contains_nan(2*SSS,  1));

        Real time2 = ParallelDescriptor::second();

        atime += (time2-time1);

        BuildGramMatrix(Gg, PR, rt, SSS);

        const Real time3 = ParallelDescriptor::second();

        gtime += (time3-time2);
        //
        // Form G[][] and g[] from Gg.
        //
        for (int i = 0, k = 0; i < 4*SSS+1; i++)
        {
            for (int j = 0; j < 4*SSS+1; j++)
                //
                // First 4*SSS+1 elements in each row go to G[][].
                //
                G[i][j] = Gg[k++];
            //
            // Last element in row goes to g[].
            //
            g[i] = Gg[k++];
        }

        // Coefficient vectors in the PR basis: aj starts as the unit vector
        // selecting p (component 0), cj as the one selecting r (component
        // 2*SSS+1), ej (the solution update) as zero.
        zero(aj, 4*SSS+1); aj[0]       = 1;
        zero(cj, 4*SSS+1); cj[2*SSS+1] = 1;
        zero(ej, 4*SSS+1);

        for (int nit = 0; nit < SSS; nit++)
        {
            gemv( Tpaj,  Tp, aj, 4*SSS+1, 4*SSS+1);
            gemv( Tpcj,  Tp, cj, 4*SSS+1, 4*SSS+1);
            gemv(Tppaj, Tpp, aj, 4*SSS+1, 4*SSS+1);

            const Real g_dot_Tpaj = dot(g, Tpaj, 4*SSS+1);

            if ( g_dot_Tpaj == 0 )
            {
                if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
                    std::cout << "CGSolver_CABiCGStab: g_dot_Tpaj == 0, nit = " << nit << '\n';
                BiCGStabFailed = true; ret = 1; break;
            }

            const Real alpha = delta / g_dot_Tpaj;

            if ( std::isinf(alpha) )
            {
                if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
                    std::cout << "CGSolver_CABiCGStab: alpha == inf, nit = " << nit << '\n';
                BiCGStabFailed = true; ret = 2; break;
            }

            axpy(temp1, Tpcj, -alpha, Tppaj, 4*SSS+1);

            gemv(temp2, G, temp1, 4*SSS+1, 4*SSS+1);

            axpy(temp3,   cj, -alpha,  Tpaj, 4*SSS+1);

            const Real omega_numerator   = dot(temp3, temp2, 4*SSS+1);
            const Real omega_denominator = dot(temp1, temp2, 4*SSS+1);
            //
            // NOTE: omega_numerator/omega_denominator can be 0/x or 0/0, but should never be x/0.
            //
            // If omega_numerator==0, and ||s||==0, then convergence, x=x+alpha*aj.
            // If omega_numerator==0, and ||s||!=0, then stabilization breakdown.
            //
            // Partial update of ej must happen before the check on omega to ensure forward progress !!!
            //
            axpy(ej, ej, alpha, aj, 4*SSS+1);
            //
            // ej has been updated so consider that we've done an iteration since
            // even if we break out of the loop we'll be able to update both sol.
            //
            niters++;
            //
            // Calculate the norm of Saad's vector 's' to check intra s-step convergence.
            //
            axpy(temp1, cj,-alpha,  Tpaj, 4*SSS+1);

            gemv(temp2, G, temp1, 4*SSS+1, 4*SSS+1);

            const Real L2_norm_of_s = dot(temp1,temp2,4*SSS+1);

            // a negative squared norm (round-off) is flushed to zero
            L2_norm_of_resid = (L2_norm_of_s < 0 ? 0 : sqrt(L2_norm_of_s));

            if ( L2_norm_of_resid < eps_rel*L2_norm_of_rt )
            {
                if ( verbose > 1 && L2_norm_of_resid == 0 && ParallelDescriptor::IOProcessor(color()) )
                    std::cout << "CGSolver_CABiCGStab: L2 norm of s: " << L2_norm_of_s << '\n';
                BiCGStabConverged = true; break;
            }

            if ( omega_denominator == 0 )
            {
                if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
                    std::cout << "CGSolver_CABiCGStab: omega_denominator == 0, nit = " << nit << '\n';
                BiCGStabFailed = true; ret = 3; break;
            }

            const Real omega = omega_numerator / omega_denominator;

            if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
            {
                if ( omega == 0   ) std::cout << "CGSolver_CABiCGStab: omega == 0, nit = " << nit << '\n';
                if ( std::isinf(omega) ) std::cout << "CGSolver_CABiCGStab: omega == inf, nit = " << nit << '\n';
            }

            if ( omega == 0   ) { BiCGStabFailed = true; ret = 4; break; }
            if ( std::isinf(omega) ) { BiCGStabFailed = true; ret = 4; break; }
            //
            // Complete the update of ej & cj now that omega is known to be ok.
            //
            axpy(ej, ej,       omega,    cj, 4*SSS+1);
            axpy(ej, ej,-omega*alpha,  Tpaj, 4*SSS+1);
            axpy(cj, cj,      -omega,  Tpcj, 4*SSS+1);
            axpy(cj, cj,      -alpha,  Tpaj, 4*SSS+1);
            axpy(cj, cj, omega*alpha, Tppaj, 4*SSS+1);
            //
            // Do an early check of the residual to determine convergence.
            //
            gemv(temp1, G, cj, 4*SSS+1, 4*SSS+1);
            //
            // sqrt( (cj,Gcj) ) == L2 norm of the intermediate residual in exact arithmetic.
            // However, finite precision can lead to the norm^2 being < 0 (Jim Demmel).
            // If cj_dot_Gcj < 0 we flush to zero and consider ourselves converged.
            //
            const Real L2_norm_of_r = dot(cj, temp1, 4*SSS+1);

            L2_norm_of_resid = (L2_norm_of_r > 0 ? sqrt(L2_norm_of_r) : 0);

            if ( L2_norm_of_resid < eps_rel*L2_norm_of_rt )
            {
                if ( verbose > 1 && L2_norm_of_resid == 0 && ParallelDescriptor::IOProcessor(color()) )
                    std::cout << "CGSolver_CABiCGStab: L2_norm_of_r: " << L2_norm_of_r << '\n';
                BiCGStabConverged = true; break;
            }

            const Real delta_next = dot(g, cj, 4*SSS+1);

            if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
            {
                if ( delta_next == 0   ) std::cout << "CGSolver_CABiCGStab: delta == 0, nit = " << nit << '\n';
                if ( std::isinf(delta_next) ) std::cout << "CGSolver_CABiCGStab: delta == inf, nit = " << nit << '\n';
            }

            if ( std::isinf(delta_next) ) { BiCGStabFailed = true; ret = 5; break; } // delta = inf?
            if ( delta_next  == 0  ) { BiCGStabFailed = true; ret = 5; break; } // Lanczos breakdown...

            const Real beta = (delta_next/delta)*(alpha/omega);

            if ( verbose > 1 && ParallelDescriptor::IOProcessor(color()) )
            {
                if ( beta == 0   ) std::cout << "CGSolver_CABiCGStab: beta == 0, nit = " << nit << '\n';
                if ( std::isinf(beta) ) std::cout << "CGSolver_CABiCGStab: beta == inf, nit = " << nit << '\n';
            }

            if ( std::isinf(beta) ) { BiCGStabFailed = true; ret = 6; break; } // beta = inf?
            if ( beta == 0   ) { BiCGStabFailed = true; ret = 6; break; } // beta = 0?  can't make further progress(?)

            axpy(aj, cj,        beta,   aj, 4*SSS+1);
            axpy(aj, aj, -omega*beta, Tpaj, 4*SSS+1);

            delta = delta_next;
        }
        //
        // Update iterates.
        //
        // sol accumulates the ej-weighted basis components; p and r are
        // rebuilt as the aj- and cj-weighted combinations of the components
        // of PR.  This is done even after a break from the inner loop.
        for (int i = 0; i < 4*SSS+1; i++)
            sxay(sol,sol,ej[i],PR,i);

        MultiFab::Copy(p,PR,0,0,1,0);
        p.mult(aj[0],0,1);
        for (int i = 1; i < 4*SSS+1; i++)
            sxay(p,p,aj[i],PR,i);

        MultiFab::Copy(r,PR,0,0,1,0);
        r.mult(cj[0],0,1);
        for (int i = 1; i < 4*SSS+1; i++)
            sxay(r,r,cj[i],PR,i);

        if ( !BiCGStabFailed && !BiCGStabConverged )
        {
            m += SSS;

            // telescope: use one more step in the next outer iteration
            if ( variable_SSS && SSS < SSS_MAX )
            {
                SSS++;
                SetMonomialBasis(Tp,Tpp,SSS);
            }
        }
    }

    if ( verbose > 0 )
    {
        if ( ParallelDescriptor::IOProcessor(color()) )
        {
            Spacer(std::cout, lev);
            std::cout << "CGSolver_CABiCGStab: Final: Iteration "
                      << std::setw(4) << niters
                      << " rel. err. "
                      << L2_norm_of_resid << '\n';
        }

        if ( verbose > 1 )
        {
            // report the maximum apply / gram times over the participating processes
            Real tmp[2] = { atime, gtime };

            ParallelDescriptor::ReduceRealMax(tmp,2,color());

            if ( ParallelDescriptor::IOProcessor(color()) )
            {
                Spacer(std::cout, lev);
                std::cout << "CGSolver_CABiCGStab apply time: " << tmp[0] << ", gram time: " << tmp[1] << '\n';
            }
        }
    }

    // Iteration budget used up without convergence or breakdown.
    if ( niters >= maxiter && !BiCGStabFailed && !BiCGStabConverged)
    {
        if ( L2_norm_of_resid > L2_norm_of_rt )
        {
            if ( ParallelDescriptor::IOProcessor(color()) )
                BoxLib::Warning("CGSolver_CABiCGStab: failed to converge!");
            //
            // Return code 8 tells the MultiGrid driver to zero out the solution!
            //
            ret = 8;
        }
        else
        {
            //
            // Return codes 1-7 tells the MultiGrid driver to smooth the solution!
            //
            ret = 7;
        }
    }

    return ret;
}