/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  lp: lattice parameters; only lp.L is read directly in this function,
 *      everything else is forwarded to the field helpers
 *  x:  passed to lapl() for the initial residual and updated via axpy()
 *      in every iteration
 *  b:  right-hand side
 *  g:  link field forwarded to lapl()
 *
 * Runs at most max_iter iterations and leaves the loop early once
 * sqrt(rr/bb) < tol. The per-iteration value printed is rr/bb, i.e. the
 * squared relative residual. Timing statistics for lapl() are printed at
 * the end.
 *
 * NOTE(review): the exact semantics of xmy/xeqy/axpy/xpay are defined
 * elsewhere; the comments below describe their presumed role in a
 * standard CG iteration - confirm against their definitions.
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector */
  lapl(lp, r, x, g);
  xmy(lp, b, r);   /* presumably r = b - r */
  xeqy(lp, p, r);  /* presumably p = r */

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);
  float bb = xdotx(lp, b);

  double t_lapl = 0;  /* accumulated wall time spent inside lapl() in the loop */
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);

    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);
    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  /*
   * Average time per timed lapl() call. If the loop exited on its very
   * first convergence check, iter is 0 and nothing was timed; the
   * unguarded t_lapl/iter would be 0/0 = NaN. Report zeros in that case
   * (and whenever no time could be measured) instead of NaN/Inf.
   */
  double t_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  /* 50 and 40 are presumably flops and bytes per lattice site - TODO confirm */
  double beta_fp = (t_call > 0.0) ? 50*((double)L*L*L)/t_call*1e-9 : 0.0;
  double beta_io = (t_call > 0.0) ? 40*((double)L*L*L)/t_call*1e-9 : 0.0;

  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *  L:  lattice extent, forwarded to every field helper
 *  x:  passed to lapl() for the initial residual and updated via axpy()
 *      in every iteration
 *  b:  right-hand side
 *  u:  gauge/link field forwarded to lapl()
 *
 * Runs at most max_iter iterations and leaves the loop early once
 * sqrt(rr/bb) < tol. The per-iteration value printed is rr/bb, i.e. the
 * squared relative residual. Timing statistics for lapl() are printed at
 * the end.
 *
 * NOTE(review): the exact semantics of xmy/xeqy/axpy/xpay are defined
 * elsewhere; the comments below describe their presumed role in a
 * standard CG iteration - confirm against their definitions.
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector */
  lapl(L, r, x, u);
  xmy(L, b, r);   /* presumably r = b - r */
  xeqy(L, p, r);  /* presumably p = r */

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);
  float bb = xdotx(L, b);

  double t_lapl = 0;  /* accumulated wall time spent inside lapl() in the loop */
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);

    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);
    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  /*
   * Average time per timed lapl() call. If the loop exited on its very
   * first convergence check, iter is 0 and nothing was timed; the
   * unguarded t_lapl/iter would be 0/0 = NaN. Report zeros in that case
   * (and whenever no time could be measured) instead of NaN/Inf.
   */
  double t_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  /* 34 and 32 are presumably flops and bytes per lattice site - TODO confirm */
  double beta_fp = (t_call > 0.0) ? 34*L*L/t_call*1e-9 : 0.0;
  double beta_io = (t_call > 0.0) ? 32*L*L/t_call*1e-9 : 0.0;

  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_call, beta_fp, beta_io);

  free(r);
  free(p);
  free(Ap);
  return;
}
int main() { DenseVector sings; GeMat deltas(3,2); std::vector<GeMat> _deltas; Function x2f(x2, sings); Function onef(one, sings); Function x3f(x3, sings); Function cosf(mycos, sings); Function sinf(mysin, sings); Function expf(myexp, sings); Basis basis(4); basis.template enforceBoundaryCondition<lawa::DirichletBC>(); IndexSet indexset; Coeff1D coeff; std::vector<Function> fvec; int rank = 2; int dim = 64; for (int i=1; i<=32; ++i) { fvec.push_back(cosf); fvec.push_back(onef); fvec.push_back(x2f); fvec.push_back(onef); } SepCoeff coeffs(rank, dim); IndexSetVec indexsetvec(dim); lawa::SeparableFunctionD<T> F(fvec, rank, dim); MatInt derivs(rank, dim); for (int i=1; i<=rank; ++i) { for (int j=1; j<=dim; ++j) { derivs(i,j) = 0; _deltas.push_back(deltas); } } lawa::SeparableRHSD<T, Basis> Fint(basis, F, _deltas, derivs); getFullIndexSet(basis, indexset, 2); std::cout << "The index set size is\n" << indexset.size() << std::endl; for (int l=0; (unsigned)l<indexsetvec.size(); ++l) { indexsetvec[l] = indexset; } /* Map */ lawa::Mapwavind<Index1D> map(dim); map.rehash(50); genCoefficients(coeffs, Fint, indexsetvec); lawa::HTCoefficients<T, Basis> f(dim, basis, map); lawa::HTCoefficients<T, Basis> u(dim, basis, map); lawa::HTCoefficients<T, Basis> r(dim, basis, map); Laplace1D LaplaceBil(basis); RefLaplace1D RefLaplaceBil(basis.refinementbasis); Identity1D IdentityBil(basis); RefIdentity1D RefIdentityBil(basis.refinementbasis); LOp_Lapl1D lapl(basis, basis, RefLaplaceBil, LaplaceBil); Sepop A(lapl, dim, dim); lawa::Sepdiagscal<Basis> S(dim, basis); setScaling(S, 0.5); lawa::HTAWGM_Params params; params.maxit_pcg = 100; params.maxit_awgm = 100; params.tol_awgm = 1e-08; params.delta1_pcg = 1e-01; params.delta2_pcg = 1e-01; params.delta3_pcg = 1e-01; params.alpha = 0.95; params.recompr = 1e-02; params.gamma = 0.1; params.theta = 1e-08; std::cout << "HTAWGM params =\n"; std::cout << params << std::endl; unsigned its; double res; its = htawgm(A, S, u, Fint, indexsetvec, res, 
params); std::cout << "htawgm took " << its << " iterations to reach " << res << " accuracy" << std::endl; std::cout << "Final scaling set to\n" << S << std::endl; return 0; }
bool VMCcuda::run() { if (UseDrift == "yes") return runWithDrift(); resetRun(); IndexType block = 0; IndexType nAcceptTot = 0; IndexType nRejectTot = 0; IndexType updatePeriod= (QMCDriverMode[QMC_UPDATE_MODE]) ? Period4CheckProperties : (nBlocks+1)*nSteps; int nat = W.getTotalNum(); int nw = W.getActiveWalkers(); vector<RealType> LocalEnergy(nw); vector<PosType> delpos(nw); vector<PosType> newpos(nw); vector<ValueType> ratios(nw); vector<GradType> oldG(nw), newG(nw); vector<ValueType> oldL(nw), newL(nw); vector<Walker_t*> accepted(nw); Matrix<ValueType> lapl(nw, nat); Matrix<GradType> grad(nw, nat); double Esum; // First do warmup steps for (int step=0; step<myWarmupSteps; step++) { for(int iat=0; iat<nat; ++iat) { //create a 3N-Dimensional Gaussian with variance=1 makeGaussRandomWithEngine(delpos,Random); for(int iw=0; iw<nw; ++iw) { PosType G = W[iw]->Grad[iat]; newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw]; ratios[iw] = 1.0; } W.proposeMove_GPU(newpos, iat); Psi.ratio(W,iat,ratios,newG, newL); accepted.clear(); vector<bool> acc(nw, false); for(int iw=0; iw<nw; ++iw) { if(ratios[iw]*ratios[iw] > Random()) { accepted.push_back(W[iw]); nAccept++; W[iw]->R[iat] = newpos[iw]; acc[iw] = true; } else nReject++; } W.acceptMove_GPU(acc); if (accepted.size()) Psi.update(accepted,iat); } } do { IndexType step = 0; nAccept = nReject = 0; Esum = 0.0; Estimators->startBlock(nSteps); do { ++step; ++CurrentStep; for (int isub=0; isub<nSubSteps; isub++) { for(int iat=0; iat<nat; ++iat) { //create a 3N-Dimensional Gaussian with variance=1 makeGaussRandomWithEngine(delpos,Random); for(int iw=0; iw<nw; ++iw) { PosType G = W[iw]->Grad[iat]; newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw]; ratios[iw] = 1.0; } W.proposeMove_GPU(newpos, iat); Psi.ratio(W,iat,ratios,newG, newL); accepted.clear(); vector<bool> acc(nw, false); for(int iw=0; iw<nw; ++iw) { if(ratios[iw]*ratios[iw] > Random()) { accepted.push_back(W[iw]); nAccept++; W[iw]->R[iat] = newpos[iw]; acc[iw] = true; } else 
nReject++; } W.acceptMove_GPU(acc); if (accepted.size()) Psi.update(accepted,iat); } } Psi.gradLapl(W, grad, lapl); H.evaluate (W, LocalEnergy); if (myPeriod4WalkerDump && (CurrentStep % myPeriod4WalkerDump)==0) W.saveEnsemble(); Estimators->accumulate(W); } while(step<nSteps); Psi.recompute(W); // vector<RealType> logPsi(W.WalkerList.size(), 0.0); // Psi.evaluateLog(W, logPsi); double accept_ratio = (double)nAccept/(double)(nAccept+nReject); Estimators->stopBlock(accept_ratio); nAcceptTot += nAccept; nRejectTot += nReject; ++block; recordBlock(block); } while(block<nBlocks); //Mover->stopRun(); //finalize a qmc section return finalize(block); }
bool VMCcuda::runWithDrift() { resetRun(); IndexType block = 0; IndexType nAcceptTot = 0; IndexType nRejectTot = 0; int nat = W.getTotalNum(); int nw = W.getActiveWalkers(); vector<RealType> LocalEnergy(nw), oldScale(nw), newScale(nw); vector<PosType> delpos(nw); vector<PosType> dr(nw); vector<PosType> newpos(nw); vector<ValueType> ratios(nw), rplus(nw), rminus(nw); vector<PosType> oldG(nw), newG(nw); vector<ValueType> oldL(nw), newL(nw); vector<Walker_t*> accepted(nw); Matrix<ValueType> lapl(nw, nat); Matrix<GradType> grad(nw, nat); // First, do warmup steps for (int step=0; step<myWarmupSteps; step++) { for(int iat=0; iat<nat; iat++) { Psi.getGradient (W, iat, oldG); //create a 3N-Dimensional Gaussian with variance=1 makeGaussRandomWithEngine(delpos,Random); for(int iw=0; iw<nw; iw++) { oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]); dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]); newpos[iw]=W[iw]->R[iat] + dr[iw]; ratios[iw] = 1.0; } W.proposeMove_GPU(newpos, iat); Psi.ratio(W,iat,ratios,newG, newL); accepted.clear(); vector<bool> acc(nw, false); for(int iw=0; iw<nw; ++iw) { PosType drOld = newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]); RealType logGf = -m_oneover2tau * dot(drOld, drOld); newScale[iw] = getDriftScale(m_tauovermass,newG[iw]); PosType drNew = (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat]; RealType logGb = -m_oneover2tau * dot(drNew, drNew); RealType x = logGb - logGf; RealType prob = ratios[iw]*ratios[iw]*std::exp(x); if(Random() < prob) { accepted.push_back(W[iw]); nAccept++; W[iw]->R[iat] = newpos[iw]; acc[iw] = true; } else nReject++; } W.acceptMove_GPU(acc); if (accepted.size()) Psi.update(accepted,iat); } } // Now do data collection steps do { IndexType step = 0; nAccept = nReject = 0; Estimators->startBlock(nSteps); do { step++; CurrentStep++; for (int isub=0; isub<nSubSteps; isub++) { for(int iat=0; iat<nat; iat++) { Psi.getGradient (W, iat, oldG); //create a 3N-Dimensional Gaussian with variance=1 
makeGaussRandomWithEngine(delpos,Random); for(int iw=0; iw<nw; iw++) { oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]); dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]); newpos[iw]=W[iw]->R[iat] + dr[iw]; ratios[iw] = 1.0; } W.proposeMove_GPU(newpos, iat); Psi.ratio(W,iat,ratios,newG, newL); accepted.clear(); vector<bool> acc(nw, false); for(int iw=0; iw<nw; ++iw) { PosType drOld = newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]); // if (dot(drOld, drOld) > 25.0) // cerr << "Large drift encountered! Old drift = " << drOld << endl; RealType logGf = -m_oneover2tau * dot(drOld, drOld); newScale[iw] = getDriftScale(m_tauovermass,newG[iw]); PosType drNew = (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat]; // if (dot(drNew, drNew) > 25.0) // cerr << "Large drift encountered! Drift = " << drNew << endl; RealType logGb = -m_oneover2tau * dot(drNew, drNew); RealType x = logGb - logGf; RealType prob = ratios[iw]*ratios[iw]*std::exp(x); if(Random() < prob) { accepted.push_back(W[iw]); nAccept++; W[iw]->R[iat] = newpos[iw]; acc[iw] = true; } else nReject++; } W.acceptMove_GPU(acc); if (accepted.size()) Psi.update(accepted,iat); } // cerr << "Rank = " << myComm->rank() << // " CurrentStep = " << CurrentStep << " isub = " << isub << endl; } Psi.gradLapl(W, grad, lapl); H.evaluate (W, LocalEnergy); if (myPeriod4WalkerDump && (CurrentStep % myPeriod4WalkerDump)==0) W.saveEnsemble(); Estimators->accumulate(W); } while(step<nSteps); Psi.recompute(W); double accept_ratio = (double)nAccept/(double)(nAccept+nReject); Estimators->stopBlock(accept_ratio); nAcceptTot += nAccept; nRejectTot += nReject; ++block; recordBlock(block); } while(block<nBlocks); //finalize a qmc section if (!myComm->rank()) gpu::cuda_memory_manager.report(); return finalize(block); }
// Diffusion Monte Carlo driver.
//
// Per step: resize all per-walker work arrays to the current population,
// sweep over all particles with drift-diffusion moves, evaluate the
// gradient/Laplacian and the local energy, optionally attempt nonlocal
// moves (NonLocalMove == "yes"), reweight every walker and branch.
// Weight scaling via branchWeightTau is used when ScaleWeight == "yes".
// Runs at least one block of at least one step (both loops are do/while)
// and returns the result of finalize().
bool DMCcuda::run()
{
  bool NLmove = NonLocalMove == "yes";
  bool scaleweight = ScaleWeight == "yes";
  if (NLmove)
    app_log() << " Using Casula nonlocal moves in DMCcuda.\n";
  if (scaleweight)
    app_log() << " Scaling weight per Umrigar/Nightengale.\n";
  resetRun();
  Mover->MaxAge = 1;
  IndexType block = 0;
  IndexType nAcceptTot = 0;
  IndexType nRejectTot = 0;
  int nat = W.getTotalNum();
  int nw = W.getActiveWalkers();
  // Per-walker work arrays; all are resized again at the top of every step
  // because branching changes the number of active walkers.
  vector<RealType> LocalEnergy(nw), LocalEnergyOld(nw), oldScale(nw), newScale(nw);
  vector<PosType> delpos(nw);
  vector<PosType> dr(nw);
  vector<PosType> newpos(nw);
  vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
  vector<PosType> oldG(nw), newG(nw);
  vector<ValueType> oldL(nw), newL(nw);
  vector<Walker_t*> accepted(nw);
  Matrix<ValueType> lapl(nw, nat);
  Matrix<GradType> grad(nw, nat);
  vector<ValueType> V2(nw), V2bar(nw);
  vector<vector<NonLocalData> > Txy(nw);
  for (int iw=0; iw<nw; iw++)
    W[iw]->Weight = 1.0;
  do
  {
    IndexType step = 0;
    nAccept = nReject = 0;
    Estimators->startBlock(nSteps);
    do
    {
      step++;
      CurrentStep++;
      nw = W.getActiveWalkers();
      ResizeTimer.start();
      LocalEnergy.resize(nw);
      oldScale.resize(nw);
      newScale.resize(nw);
      delpos.resize(nw);
      dr.resize(nw);
      newpos.resize(nw);
      ratios.resize(nw);
      rplus.resize(nw);
      rminus.resize(nw);
      oldG.resize(nw);
      newG.resize(nw);
      oldL.resize(nw);
      newL.resize(nw);
      accepted.resize(nw);
      lapl.resize(nw, nat);
      grad.resize(nw, nat);
      // NOTE(review): vector::resize(n, 0.0) only value-initializes newly
      // added elements; existing entries of R2prop/R2acc/V2/V2bar keep
      // their values from the previous step and are accumulated into
      // below. Confirm that carrying them across steps is intended.
      R2prop.resize(nw,0.0);
      R2acc.resize(nw,0.0);
      V2.resize(nw,0.0);
      V2bar.resize(nw,0.0);
      W.updateLists_GPU();
      ResizeTimer.stop();
      if (NLmove)
      {
        // Reset each walker's nonlocal move list to a single entry with
        // PID -1 and weight 1.0 (index 0; selectMove() returning 0 below
        // is treated as "no nonlocal move").
        Txy.resize(nw);
        for (int iw=0; iw<nw; iw++)
        {
          Txy[iw].clear();
          Txy[iw].push_back(NonLocalData(-1, 1.0, PosType()));
        }
      }
      // Age is reset to 0 for a walker as soon as one of its moves is accepted
      for (int iw=0; iw<nw; iw++)
        W[iw]->Age++;
      DriftDiffuseTimer.start();
      for(int iat=0; iat<nat; iat++)
      {
        Psi.getGradient (W, iat, oldG);
        //create a 3N-Dimensional Gaussian with variance=1
        makeGaussRandomWithEngine(delpos,Random);
        for(int iw=0; iw<nw; iw++)
        {
          delpos[iw] *= m_sqrttau;
          oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
          dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
          newpos[iw]=W[iw]->R[iat] + dr[iw];
          ratios[iw] = 1.0;
          // squared length of the diffusion part of the proposed move
          R2prop[iw] += dot(delpos[iw], delpos[iw]);
        }
        W.proposeMove_GPU(newpos, iat);
        Psi.ratio(W,iat,ratios,newG, newL);
        accepted.clear();
        vector<bool> acc(nw, false);
        for(int iw=0; iw<nw; ++iw)
        {
          // Forward (logGf) and backward (logGb) log Green's function terms
          PosType drOld =
            newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
          RealType logGf = -m_oneover2tau * dot(drOld, drOld);
          newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
          PosType drNew =
            (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
          RealType logGb = -m_oneover2tau * dot(drNew, drNew);
          RealType x = logGb - logGf;
          RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
          // A move with a non-positive ratio is always rejected
          if(Random() < prob && ratios[iw] > 0.0)
          {
            accepted.push_back(W[iw]);
            nAccept++;
            W[iw]->R[iat] = newpos[iw];
            W[iw]->Age = 0;
            acc[iw] = true;
            R2acc[iw] += dot(delpos[iw], delpos[iw]);
            V2[iw] += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
            V2bar[iw] += newScale[iw] * newScale[iw] * dot(newG[iw],newG[iw]);
          }
          else
          {
            nReject++;
            V2[iw] += m_tauovermass * m_tauovermass * dot(oldG[iw],oldG[iw]);
            V2bar[iw] += oldScale[iw] * oldScale[iw] * dot(oldG[iw],oldG[iw]);
          }
        }
        W.acceptMove_GPU(acc);
        if (accepted.size())
          Psi.update(accepted,iat);
      }
      DriftDiffuseTimer.stop();
      // Psi.recompute(W, false);
      Psi.gradLapl(W, grad, lapl);
      HTimer.start();
      if (NLmove)
        H.evaluate (W, LocalEnergy, Txy);
      else
        H.evaluate (W, LocalEnergy);
      HTimer.stop();
      // for (int iw=0; iw<nw; iw++) {
      //   branchEngine->clampEnergy(LocalEnergy[iw]);
      //   W[iw]->getPropertyBase()[LOCALENERGY] = LocalEnergy[iw];
      // }
      // On the very first step there is no previous energy yet
      if (CurrentStep == 1)
        LocalEnergyOld = LocalEnergy;
      if (NLmove)
      {
        // Now, attempt nonlocal move
        accepted.clear();
        vector<int> iatList;
        vector<PosType> accPos;
        for (int iw=0; iw<nw; iw++)
        {
          /// HACK HACK HACK
          // if (LocalEnergy[iw] < -2300.0) {
          //   cerr << "Walker " << iw << " has energy "
          //        << LocalEnergy[iw] << endl;;
          //   double maxWeight = 0.0;
          //   int elMax = -1;
          //   PosType posMax;
          //   for (int j=1; j<Txy[iw].size(); j++)
          //     if (std::fabs(Txy[iw][j].Weight) > std::fabs(maxWeight)) {
          //       maxWeight = Txy[iw][j].Weight;
          //       elMax = Txy[iw][j].PID;
          //       posMax = W[iw]->R[elMax] + Txy[iw][j].Delta;
          //     }
          //   cerr << "Maximum weight is " << maxWeight << " for electron "
          //        << elMax << " at position " << posMax << endl;
          //   PosType unit = W.Lattice.toUnit(posMax);
          //   unit[0] -= round(unit[0]);
          //   unit[1] -= round(unit[1]);
          //   unit[2] -= round(unit[2]);
          //   cerr << "Reduced position = " << unit << endl;
          // }
          int ibar = NLop.selectMove(Random(), Txy[iw]);
          // ibar == 0 selects the placeholder entry: no move for this walker
          if (ibar)
          {
            accepted.push_back(W[iw]);
            int iat = Txy[iw][ibar].PID;
            iatList.push_back(iat);
            accPos.push_back(W[iw]->R[iat] + Txy[iw][ibar].Delta);
          }
        }
        if (accepted.size())
        {
          // NOTE(review): this call receives ratios/newG/newL, which are
          // read again (newG) in the branch loop below indexed by walker -
          // confirm whether it overwrites them.
          Psi.ratio(accepted,iatList, accPos, ratios, newG, newL);
          Psi.update(accepted,iatList);
          for (int i=0; i<accepted.size(); i++)
            accepted[i]->R[iatList[i]] = accPos[i];
          W.NLMove_GPU (accepted, accPos, iatList);
          // HACK HACK HACK
          // Recompute the kinetic energy
          // Psi.gradLapl(W, grad, lapl);
          // H.evaluate (W, LocalEnergy);
          //W.copyWalkersToGPU();
        }
      }
      // Now branch
      BranchTimer.start();
      for (int iw=0; iw<nw; iw++)
      {
        RealType v2=0.0, v2bar=0.0;
        // NOTE(review): inside this particle loop the terms use newG[iw]
        // (indexed by walker, identical for every iat) and v2 receives two
        // different contributions per iteration (W.G[iat] and newG[iw]).
        // The per-walker sums V2/V2bar gathered during the sweep are not
        // used (see the commented-out line below). Verify this is intended.
        for(int iat=0; iat<nat; iat++)
        {
          v2 += dot(W.G[iat],W.G[iat]);
          RealType newscale = getDriftScale(m_tauovermass,newG[iw]);
          v2 += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
          v2bar += newscale * newscale * dot(newG[iw],newG[iw]);
        }
        //RealType scNew = std::sqrt(V2bar[iw] / V2[iw]);
        RealType scNew = std::sqrt(v2bar/v2);
        RealType scOld = (CurrentStep == 1) ? scNew : W[iw]->getPropertyBase()[DRIFTSCALE];
        W[iw]->getPropertyBase()[DRIFTSCALE] = scNew;
        // fprintf (stderr, "iw = %d scNew = %1.8f scOld = %1.8f\n", iw, scNew, scOld);
        // Ratio of accepted to proposed squared diffusion distance
        RealType tauRatio = R2acc[iw] / R2prop[iw];
        if (tauRatio < 0.5)
          cerr << " tauRatio = " << tauRatio << endl;
        RealType taueff = m_tauovermass * tauRatio;
        if (scaleweight)
          W[iw]->Weight *= branchEngine->branchWeightTau
                           (LocalEnergy[iw], LocalEnergyOld[iw], scNew, scOld, taueff);
        else
          W[iw]->Weight *= branchEngine->branchWeight
                           (LocalEnergy[iw], LocalEnergyOld[iw]);
        W[iw]->getPropertyBase()[R2ACCEPTED] = R2acc[iw];
        W[iw]->getPropertyBase()[R2PROPOSED] = R2prop[iw];
      }
      Mover->setMultiplicity(W.begin(), W.end());
      branchEngine->branch(CurrentStep,W);
      // The population may have changed: refresh the count and carry each
      // surviving walker's stored local energy into the next step.
      nw = W.getActiveWalkers();
      LocalEnergyOld.resize(nw);
      for (int iw=0; iw<nw; iw++)
        LocalEnergyOld[iw] = W[iw]->getPropertyBase()[LOCALENERGY];
      BranchTimer.stop();
    }
    while(step<nSteps);
    Psi.recompute(W, true);
    // NOTE(review): NaN if no move was proposed in this block (nAccept+nReject == 0)
    double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
    Estimators->stopBlock(accept_ratio);
    nAcceptTot += nAccept;
    nRejectTot += nReject;
    ++block;
    recordBlock(block);
  }
  while(block<nBlocks);
  //finalize a qmc section
  return finalize(block);
}
// Diffusion Monte Carlo driver that always attempts nonlocal moves.
//
// Per step: resize the per-walker work arrays, sweep over all particles
// with drift-diffusion moves, warn about walkers that accepted no move,
// evaluate the local energy together with the nonlocal move lists (Txy),
// apply the selected nonlocal moves, reweight with branchWeight() and
// branch. Runs at least one block of at least one step (both loops are
// do/while) and returns the result of finalize().
bool DMCcuda::runWithNonlocal()
{
  resetRun();
  Mover->MaxAge = 1;
  IndexType block = 0;
  IndexType nAcceptTot = 0;
  IndexType nRejectTot = 0;
  int nat = W.getTotalNum();
  int nw = W.getActiveWalkers();
  // Per-walker work arrays; all are resized again at the top of every step
  // because branching changes the number of active walkers.
  vector<RealType> LocalEnergy(nw), LocalEnergyOld(nw), oldScale(nw), newScale(nw);
  vector<PosType> delpos(nw);
  vector<PosType> dr(nw);
  vector<PosType> newpos(nw);
  vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
  vector<PosType> oldG(nw), newG(nw);
  vector<ValueType> oldL(nw), newL(nw);
  vector<Walker_t*> accepted(nw);
  Matrix<ValueType> lapl(nw, nat);
  Matrix<GradType> grad(nw, nat);
  vector<vector<NonLocalData> > Txy(nw);
  for (int iw=0; iw<nw; iw++)
    W[iw]->Weight = 1.0;
  do
  {
    IndexType step = 0;
    nAccept = nReject = 0;
    Estimators->startBlock(nSteps);
    do
    {
      step++;
      CurrentStep++;
      nw = W.getActiveWalkers();
      LocalEnergy.resize(nw);
      oldScale.resize(nw);
      newScale.resize(nw);
      delpos.resize(nw);
      dr.resize(nw);
      newpos.resize(nw);
      ratios.resize(nw);
      rplus.resize(nw);
      rminus.resize(nw);
      oldG.resize(nw);
      newG.resize(nw);
      oldL.resize(nw);
      newL.resize(nw);
      accepted.resize(nw);
      lapl.resize(nw, nat);
      grad.resize(nw, nat);
      // NOTE(review): vector::resize(n, 0.0) only value-initializes newly
      // added elements; existing R2prop/R2acc entries keep their values
      // from the previous step. Confirm that this is intended.
      R2prop.resize(nw,0.0);
      R2acc.resize(nw,0.0);
      W.updateLists_GPU();
      Txy.resize(nw);
      for (int iw=0; iw<nw; iw++)
      {
        // Reset the nonlocal move list to a single entry with PID -1 and
        // weight 1.0 (index 0; selectMove() returning 0 below is treated
        // as "no nonlocal move").
        Txy[iw].clear();
        Txy[iw].push_back(NonLocalData(-1, 1.0, PosType()));
        // Age is reset to 0 as soon as one of the walker's moves is accepted
        W[iw]->Age++;
      }
      for(int iat=0; iat<nat; iat++)
      {
        Psi.getGradient (W, iat, oldG);
        //create a 3N-Dimensional Gaussian with variance=1
        makeGaussRandomWithEngine(delpos,Random);
        for(int iw=0; iw<nw; iw++)
        {
          delpos[iw] *= m_sqrttau;
          oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
          dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
          newpos[iw]=W[iw]->R[iat] + dr[iw];
          ratios[iw] = 1.0;
          // squared length of the diffusion part of the proposed move
          R2prop[iw] += dot(delpos[iw], delpos[iw]);
        }
        W.proposeMove_GPU(newpos, iat);
        Psi.ratio(W,iat,ratios,newG, newL);
        accepted.clear();
        vector<bool> acc(nw, false);
        for(int iw=0; iw<nw; ++iw)
        {
          // Forward (logGf) and backward (logGb) log Green's function terms
          PosType drOld =
            newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
          RealType logGf = -m_oneover2tau * dot(drOld, drOld);
          newScale[iw] = getDriftScale(m_tauovermass,newG[iw]);
          PosType drNew =
            (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
          RealType logGb = -m_oneover2tau * dot(drNew, drNew);
          RealType x = logGb - logGf;
          RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
          // A move with a non-positive ratio is always rejected
          if(Random() < prob && ratios[iw] > 0.0)
          {
            accepted.push_back(W[iw]);
            nAccept++;
            W[iw]->R[iat] = newpos[iw];
            W[iw]->Age = 0;
            acc[iw] = true;
            R2acc[iw] += dot(delpos[iw], delpos[iw]);
          }
          else
            nReject++;
        }
        W.acceptMove_GPU(acc);
        if (accepted.size())
          Psi.update(accepted,iat);
      }
      // A non-zero Age here means none of this walker's moves was accepted
      // during the whole sweep.
      for (int iw=0; iw < nw; iw++)
        if (W[iw]->Age)
          cerr << "Encountered stuck walker with iw=" << iw << endl;
      // Psi.recompute(W, false);
      Psi.gradLapl(W, grad, lapl);
      H.evaluate (W, LocalEnergy, Txy);
      // On the very first step there is no previous energy yet
      if (CurrentStep == 1)
        LocalEnergyOld = LocalEnergy;
      // Now, attempt nonlocal move
      accepted.clear();
      vector<int> iatList;
      vector<PosType> accPos;
      for (int iw=0; iw<nw; iw++)
      {
        int ibar = NLop.selectMove(Random(), Txy[iw]);
        // cerr << "Txy[iw].size() = " << Txy[iw].size() << endl;
        // ibar == 0 selects the placeholder entry: no move for this walker
        if (ibar)
        {
          accepted.push_back(W[iw]);
          int iat = Txy[iw][ibar].PID;
          iatList.push_back(iat);
          accPos.push_back(W[iw]->R[iat] + Txy[iw][ibar].Delta);
        }
      }
      if (accepted.size())
      {
        // W.proposeMove_GPU(newpos, iatList);
        Psi.ratio(accepted,iatList, accPos, ratios, newG, newL);
        Psi.update(accepted,iatList);
        for (int i=0; i<accepted.size(); i++)
          accepted[i]->R[iatList[i]] = accPos[i];
        W.copyWalkersToGPU();
      }
      // Now branch
      for (int iw=0; iw<nw; iw++)
      {
        W[iw]->Weight *= branchEngine->branchWeight(LocalEnergy[iw], LocalEnergyOld[iw]);
        W[iw]->getPropertyBase()[R2ACCEPTED] = R2acc[iw];
        W[iw]->getPropertyBase()[R2PROPOSED] = R2prop[iw];
      }
      Mover->setMultiplicity(W.begin(), W.end());
      branchEngine->branch(CurrentStep,W);
      // The population may have changed: refresh the count and carry each
      // surviving walker's stored local energy into the next step.
      nw = W.getActiveWalkers();
      LocalEnergyOld.resize(nw);
      for (int iw=0; iw<nw; iw++)
        LocalEnergyOld[iw] = W[iw]->getPropertyBase()[LOCALENERGY];
    }
    while(step<nSteps);
    Psi.recompute(W, true);
    // NOTE(review): NaN if no move was proposed in this block (nAccept+nReject == 0)
    double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
    Estimators->stopBlock(accept_ratio);
    nAcceptTot += nAccept;
    nRejectTot += nReject;
    ++block;
    recordBlock(block);
  }
  while(block<nBlocks);
  //finalize a qmc section
  return finalize(block);
}
// Builds (or clones) a single-particle orbital set from the XML node `cur`.
//
// Outline:
//   1. read builder attributes from XMLRoot and from `cur`
//   2. scan the children of `cur` for <occupation> elements
//   3. if a set with the same (file, spin set, size) key is already in
//      SPOSetMap, the occupation list is empty and qafm is 0, return a
//      clone of it
//   4. set up the tiling matrix, read and broadcast the orbital info,
//      analyze the twists
//   5. create the OrbitalSet object and read the bands, or hand the work
//      to a BsplineReaderBase and return its result directly
//   6. register the set in SPOSetMap, optionally create the ion
//      ParticleSet named by "source", optionally initialize GPU data
//
// Returns the created set; the spline-reader paths return whatever
// create_spline_set() produced, which is only stored in SPOSetMap when
// it is non-null.
SPOSetBase*
EinsplineSetBuilder::createSPOSet(xmlNodePtr cur)
{
  //use 2 bohr as the default when truncated orbitals are used based on the extend of the ions
  BufferLayer=2.0;
  OhmmsAttributeSet attribs;
  int numOrbs = 0;
  qafm=0;
  int sortBands(1);
  string sourceName;
  string spo_prec("double");
  string truncate("no");
#if defined(QMC_CUDA)
  string useGPU="yes";
#else
  string useGPU="no";
#endif
  attribs.add (H5FileName, "href");
  attribs.add (TileFactor, "tile");
  attribs.add (sortBands, "sort");
  attribs.add (qafm, "afmshift");
  attribs.add (TileMatrix, "tilematrix");
  attribs.add (TwistNum, "twistnum");
  attribs.add (givenTwist, "twist");
  attribs.add (sourceName, "source");
  attribs.add (MeshFactor, "meshfactor");
  attribs.add (useGPU, "gpu");
  attribs.add (spo_prec, "precision");
  attribs.add (truncate, "truncate");
  attribs.add (BufferLayer, "buffer");
  // Attributes are taken from the root node first; the two size
  // attributes are registered afterwards and only looked up on `cur`,
  // where the earlier ones are applied a second time as well.
  attribs.put (XMLRoot);
  attribs.add (numOrbs, "size");
  attribs.add (numOrbs, "norbs");
  attribs.put (cur);
  ///////////////////////////////////////////////
  // Read occupation information from XML file //
  ///////////////////////////////////////////////
  cur = cur->children;
  int spinSet = -1;
  vector<int> Occ_Old(0,0);
  Occ.resize(0,0);
  bool NewOcc(false);
  while (cur != NULL)
  {
    string cname((const char*)(cur->name));
    if(cname == "occupation")
    {
      string occ_mode("ground");
      occ_format="energy";
      particle_hole_pairs=0;
      OhmmsAttributeSet oAttrib;
      oAttrib.add(occ_mode,"mode");
      oAttrib.add(spinSet,"spindataset");
      oAttrib.add(occ_format,"format");
      oAttrib.add(particle_hole_pairs,"pairs");
      oAttrib.put(cur);
      if(occ_mode == "excited")
      {
        putContent(Occ,cur);
      }
      else if(occ_mode != "ground")
      {
        app_error() << "Only ground state occupation currently supported "
                    << "in EinsplineSetBuilder.\n";
        APP_ABORT("EinsplineSetBuilder::createSPOSet");
      }
    }
    cur = cur->next;
  }
  // Occ_Old is a local that starts out empty, so NewOcc is true exactly
  // when an "excited" occupation element filled Occ with something.
  if (Occ != Occ_Old)
  {
    NewOcc=true;
    Occ_Old = Occ;
  }
  else
    NewOcc=false;
#if defined(QMC_CUDA)
  app_log() << "\t QMC_CUDA=1 Overwriting the precision of the einspline storage on the host.\n";
  spo_prec="double"; //overwrite
#endif
  H5OrbSet aset(H5FileName, spinSet, numOrbs);
  std::map<H5OrbSet,SPOSetBase*,H5OrbSet>::iterator iter;
  iter = SPOSetMap.find (aset);
  // Reuse an existing set with the same key when nothing else changed
  if ((iter != SPOSetMap.end() ) && (!NewOcc) && (qafm==0))
  {
    qafm=0;
    app_log() << "SPOSet parameters match in EinsplineSetBuilder: "
              << "cloning EinsplineSet object.\n";
    return iter->second->makeClone();
  }
  // The tiling can be set by a simple vector, (e.g. 2x2x2), or by a
  // full 3x3 matrix of integers. If the tilematrix was not set in
  // the input file...
  bool matrixNotSet = true;
  for (int i=0; i<3; i++)
    for (int j=0; j<3; j++)
      matrixNotSet = matrixNotSet && (TileMatrix(i,j) == 0);
  // then set the matrix to what may have been specified in the
  // tiling vector
  if (matrixNotSet)
    for (int i=0; i<3; i++)
      for (int j=0; j<3; j++)
        TileMatrix(i,j) = (i==j) ? TileFactor(i) : 0;
  if (myComm->rank() == 0)
    fprintf (stderr, " [ %2d %2d %2d\n %2d %2d %2d\n %2d %2d %2d ]\n",
             TileMatrix(0,0), TileMatrix(0,1), TileMatrix(0,2),
             TileMatrix(1,0), TileMatrix(1,1), TileMatrix(1,2),
             TileMatrix(2,0), TileMatrix(2,1), TileMatrix(2,2));
  if (numOrbs == 0)
  {
    app_error() << "You must specify the number of orbitals in the input file.\n";
    APP_ABORT("EinsplineSetBuilder::createSPOSet");
  }
  else
    app_log() << " Reading " << numOrbs << " orbitals from HDF5 file.\n";
  Timer mytimer;
  mytimer.restart();
  /////////////////////////////////////////////////////////////////
  // Read the basic orbital information, without reading all the //
  // orbitals themselves. //
  /////////////////////////////////////////////////////////////////
  // Only rank 0 reads; the data is distributed by BroadcastOrbitalInfo() below
  if (myComm->rank() == 0)
    if (!ReadOrbitalInfo())
    {
      app_error() << "Error reading orbital info from HDF5 file. Aborting.\n";
      APP_ABORT("EinsplineSetBuilder::createSPOSet");
    }
  app_log() << "TIMER EinsplineSetBuilder::ReadOrbitalInfo " << mytimer.elapsed() << endl;
  myComm->barrier();
  mytimer.restart();
  BroadcastOrbitalInfo();
  app_log() << "TIMER EinsplineSetBuilder::BroadcastOrbitalInfo " << mytimer.elapsed() << endl;
  app_log().flush();
  ///////////////////////////////////////////////////////////////////
  // Now, analyze the k-point mesh to figure out the what k-points //
  // are needed //
  ///////////////////////////////////////////////////////////////////
  PrimCell.set(Lattice);
  SuperCell.set(SuperLattice);
  for (int iat=0; iat<AtomicOrbitals.size(); iat++)
    AtomicOrbitals[iat].Lattice = Lattice;
  // Copy supercell into the ParticleSets
  // app_log() << "Overwriting XML lattice with that from the ESHDF file.\n";
  // PtclPoolType::iterator piter;
  // for(piter = ParticleSets.begin(); piter != ParticleSets.end(); piter++)
  // piter->second->Lattice.copy(SuperCell);
  AnalyzeTwists2();
  //////////////////////////////////
  // Create the OrbitalSet object
  //////////////////////////////////
  if (HaveLocalizedOrbs)
    OrbitalSet = new EinsplineSetLocal;
#ifdef QMC_CUDA
  else if (AtomicOrbitals.size() > 0)
  {
    if (UseRealOrbitals)
      OrbitalSet = new EinsplineSetHybrid<double>;
    else
      OrbitalSet = new EinsplineSetHybrid<complex<double> >;
  }
#endif
  else
  {
    if (UseRealOrbitals)
      OrbitalSet = new EinsplineSetExtended<double>;
    else
      OrbitalSet = new EinsplineSetExtended<complex<double> >;
  }
  //set the internal parameters
  setTiling(OrbitalSet,numOrbs);
  if (HaveLocalizedOrbs)
  {
    // NOTE(review): the dynamic_cast result is not checked for null
    EinsplineSetLocal *restrict orbitalSet =
      dynamic_cast<EinsplineSetLocal*>(OrbitalSet);
#pragma omp critical(read_einspline_orbs)
    {
      // Bands already read for this spin set can be copied instead of re-read
      if ((spinSet == LastSpinSet) && (numOrbs <= NumOrbitalsRead) && (!NewOcc) )
        CopyBands(numOrbs);
      else
      {
        // Now, figure out occupation for the bands and read them
        OccupyBands(spinSet, sortBands);
        ReadBands (spinSet, orbitalSet);
      }
    }
    // Now, store what we have read
    LastOrbitalSet = OrbitalSet;
    LastSpinSet = spinSet;
    NumOrbitalsRead = numOrbs;
  }
  else // Otherwise, use EinsplineSetExtended
  {
    mytimer.restart();
    bool use_single= (spo_prec == "single" || spo_prec == "float");
    if (UseRealOrbitals)
    {
      OccupyBands(spinSet, sortBands);
      //check if a matching BsplineReaderBase exists
      BsplineReaderBase* spline_reader=0;
      //if(TargetPtcl.Lattice.SuperCellEnum != SUPERCELL_BULK && truncate=="yes")
      // Truncated orbitals: the reader type depends on the boundary
      // conditions (open / slab / everything else) and on the precision.
      if(truncate=="yes")
      {
        if(use_single)
        {
          if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_OPEN)
            spline_reader= new SplineMixedAdoptorReader<SplineOpenAdoptor<float,double,3> >(this);
          else if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_SLAB)
            spline_reader= new SplineMixedAdoptorReader<SplineMixedAdoptor<float,double,3> >(this);
          else
            spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<float,double,3> >(this);
        }
        else
        {
          if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_OPEN)
            spline_reader= new SplineMixedAdoptorReader<SplineOpenAdoptor<double,double,3> >(this);
          else if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_SLAB)
            spline_reader= new SplineMixedAdoptorReader<SplineMixedAdoptor<double,double,3> >(this);
          else
            spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<double,double,3> >(this);
        }
      }
      else
      {
        // Without truncation a reader is only used for single precision
        if(use_single)
          spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<float,double,3> >(this);
      }
      if(spline_reader)
      {
        HasCoreOrbs=bcastSortBands(NumDistinctOrbitals,myComm->rank()==0);
        SPOSetBase* bspline_zd=spline_reader->create_spline_set(spinSet,OrbitalSet);
        delete spline_reader;
        if(bspline_zd)
          SPOSetMap[aset] = bspline_zd;
        // Early return: the muffin-tin test output, ion ParticleSet
        // creation and GPU initialization further down are skipped.
        return bspline_zd;
      }
      else
      {
        app_log() << ">>>> Creating EinsplineSetExtended<double> <<<< " << endl;
        EinsplineSetExtended<double> *restrict orbitalSet =
          dynamic_cast<EinsplineSetExtended<double>* > (OrbitalSet);
        if (Format == ESHDF)
          ReadBands_ESHDF(spinSet,orbitalSet);
        else
          ReadBands(spinSet, orbitalSet);
      }
    }
    else
    {
      OccupyBands(spinSet, sortBands);
      BsplineReaderBase* spline_reader=0;
      // Only a message is printed; the truncate request is otherwise ignored here
      if(truncate == "yes")
      {
        app_log() << " Truncated orbitals with multiple kpoints are not supported yet!" << endl;
      }
      if(use_single)
      {
#if defined(QMC_COMPLEX)
        spline_reader= new SplineAdoptorReader<SplineC2CPackedAdoptor<float,double,3> >(this);
#else
        spline_reader= new SplineAdoptorReader<SplineC2RPackedAdoptor<float,double,3> >(this);
#endif
      }
      if(spline_reader)
      {
        RotateBands_ESHDF(spinSet, dynamic_cast<EinsplineSetExtended<complex<double> >*>(OrbitalSet));
        HasCoreOrbs=bcastSortBands(NumDistinctOrbitals,myComm->rank()==0);
        SPOSetBase* bspline_zd=spline_reader->create_spline_set(spinSet,OrbitalSet);
        delete spline_reader;
        if(bspline_zd)
          SPOSetMap[aset] = bspline_zd;
        // Early return, same as in the real-orbital branch above
        return bspline_zd;
      }
      else
      {
        EinsplineSetExtended<complex<double> > *restrict orbitalSet =
          dynamic_cast<EinsplineSetExtended<complex<double> >*>(OrbitalSet);
        if (Format == ESHDF)
          ReadBands_ESHDF(spinSet,orbitalSet);
        else
          ReadBands(spinSet, orbitalSet);
      }
    }
    app_log() << "TIMER EinsplineSetBuilder::ReadBands " << mytimer.elapsed() << endl;
  }
#ifndef QMC_COMPLEX
  // Diagnostic output: when muffin tins are present, rank 0 evaluates the
  // orbitals along a fixed line through the origin and writes them to
  // TestMuffins.dat.
  if (myComm->rank()==0 && OrbitalSet->MuffinTins.size() > 0)
  {
    // NOTE(review): the fopen result is not checked before use
    FILE *fout = fopen ("TestMuffins.dat", "w");
    Vector<double> phi(numOrbs), lapl(numOrbs);
    Vector<PosType> grad(numOrbs);
    ParticleSet P;
    P.R.resize(6);
    for (int i=0; i<P.R.size(); i++)
      P.R[i] = PosType (0.0, 0.0, 0.0);
    PosType N = 0.25*PrimCell.a(0) + 0.25*PrimCell.a(1) + 0.25*PrimCell.a(2);
    for (double x=-1.0; x<=1.0; x+=0.0000500113412)
    {
      // for (double x=-0.003; x<=0.003; x+=0.0000011329343481381) {
      P.R[0] = x * (PrimCell.a(0) + 0.914*PrimCell.a(1) + 0.781413*PrimCell.a(2));
      double r = std::sqrt(dot(P.R[0], P.R[0]));
      double rN = std::sqrt(dot(P.R[0]-N, P.R[0]-N));
      OrbitalSet->evaluate(P, 0, phi, grad, lapl);
      // OrbitalSet->evaluate(P, 0, phi);
      // first column: distance from the origin carrying the sign of x
      fprintf (fout, "%1.12e ", r*x/std::fabs(x));
      for (int j=0; j<numOrbs; j++)
      {
        double gmag = std::sqrt(dot(grad[j],grad[j]));
        fprintf (fout, "%16.12e ",
                 /*phi[j]*phi[j]**/(-5.0/r -0.5*lapl[j]/phi[j]));
        // double E = -5.0/r -0.5*lapl[j]/phi[j];
        fprintf (fout, "%16.12e ", phi[j]);
        fprintf (fout, "%16.12e ", gmag);
      }
      fprintf (fout, "\n");
    }
    fclose(fout);
  }
#endif
  SPOSetMap[aset] = OrbitalSet;
  // Create the ion ParticleSet named by the "source" attribute if it does
  // not exist yet.
  if (sourceName.size() && (ParticleSets.find(sourceName) == ParticleSets.end()))
  {
    app_log() << " EinsplineSetBuilder creates a ParticleSet " << sourceName << endl;
    ParticleSet* ions=new ParticleSet;
    ions->Lattice=TargetPtcl.Lattice;
    ESHDFIonsParser ap(*ions,H5FileID,myComm);
    ap.put(XMLRoot);
    ap.expand(TileMatrix);
    ions->setName(sourceName);
    ParticleSets[sourceName]=ions;
    //overwrite the lattice and assign random
    if(TargetPtcl.Lattice.SuperCellEnum)
    {
      TargetPtcl.Lattice=ions->Lattice;
      makeUniformRandom(TargetPtcl.R);
      TargetPtcl.R.setUnit(PosUnit::LatticeUnit);
      TargetPtcl.convert2Cart(TargetPtcl.R);
      TargetPtcl.createSK();
    }
  }
#ifdef QMC_CUDA
  if (useGPU == "yes" || useGPU == "1")
  {
    app_log() << "Initializing GPU data structures.\n";
    OrbitalSet->initGPU();
  }
#endif
  return OrbitalSet;
}
// One inner iteration of the warping solver.
//
// Differentiates the total warp (warpField + dWarpField) in x, y and t,
// evaluates the smoothness cost, the occlusion and the data cost,
// assembles the coefficients of the linear system and passes them to
// sor(), which receives dWarpField as its last argument.
// Progress messages go to stderr when params.verbosity > 1.
void STWarp<T>::warpingIteration( const WarpingField<T> &warpField,
                                  const Video<T> &Bx,
                                  const Video<T> &By,
                                  const Video<T> &Bt,
                                  const Video<T> &C,
                                  WarpingField<T> &dWarpField) {
    int rows   = dimensions[0];
    int cols   = dimensions[1];
    int frames = dimensions[2];

    if (params.verbosity > 1) {
        fprintf(stderr, " - preparing linear system...");
    }

    // Derivatives of (u,v,w)+d(u,v,w) along x, y and t
    WarpingField<T> gradX(warpField.size());
    WarpingField<T> gradY(warpField.size());
    WarpingField<T> gradT(warpField.size());
    WarpingField<T> total(warpField);
    total.add(dWarpField);

    VideoProcessing::dx(total, gradX);
    VideoProcessing::dy(total, gradY);
    VideoProcessing::dt(total, gradT);

    // Smoothness term
    Video<T> smoothCost(rows, cols, frames, 9);
    Video<T> lapl(warpField.size());
    computeSmoothCost(gradX, gradY, gradT, warpField, smoothCost, lapl);

    computeOcclusion(total, C, occlusion);

    // Data term: two channels when features are used, one otherwise
    Video<T> dataCost;
    dataCost = Video<T>(rows, cols, frames, params.useFeatures ? 2 : 1);
    computeDataCost(Bx, By, Bt, C, dWarpField, occlusion, dataCost);

    // Single-channel coefficient volumes of the linear system
    Video<T> CBx(rows, cols, frames, 1);
    Video<T> CBy(rows, cols, frames, 1);
    Video<T> CBt(rows, cols, frames, 1);
    Video<T> Bx2(rows, cols, frames, 1);
    Video<T> By2(rows, cols, frames, 1);
    Video<T> Bt2(rows, cols, frames, 1);
    Video<T> Bxy(rows, cols, frames, 1);
    Video<T> Bxt(rows, cols, frames, 1);
    Video<T> Byt(rows, cols, frames, 1);

    prepareLinearSystem(Bx, By, Bt, C, lapl, dataCost,
                        CBx, CBy, CBt,
                        Bx2, By2, Bt2,
                        Bxy, Bxt, Byt);

    if (params.verbosity > 1) {
        fprintf(stderr, "done.\n");
        fprintf(stderr, " - SOR...");
    }

    sor( dataCost, smoothCost, lapl,
         CBx, CBy, CBt,
         Bx2, By2, Bt2,
         Bxy, Bxt, Byt,
         dWarpField);

    if (params.verbosity > 1) {
        fprintf(stderr, "done.\n");
    }
}