Ejemplo n.º 1
0
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);  
  float bb = xdotx(lp, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);
    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);
    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  double beta_fp = 50*((double)L*L*L)/(t_lapl/(double)iter)*1e-9;
  double beta_io = 40*((double)L*L*L)/(t_lapl/(double)iter)*1e-9;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);  
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
	 t_lapl/(double)iter, beta_fp, beta_io);  

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}
Ejemplo n.º 2
0
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);  
  float bb = xdotx(L, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);
    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);
    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  double beta_fp = 34*L*L/(t_lapl/(double)iter)*1e-9;
  double beta_io = 32*L*L/(t_lapl/(double)iter)*1e-9;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);  
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
	 t_lapl/(double)iter, beta_fp, beta_io);  

  free(r);
  free(p);
  free(Ap);
  return;
}
Ejemplo n.º 3
0
int
main()
{
    DenseVector     sings;
    GeMat           deltas(3,2);

    std::vector<GeMat> _deltas;
    Function        x2f(x2, sings);
    Function        onef(one, sings);
    Function        x3f(x3, sings);
    Function        cosf(mycos, sings);
    Function        sinf(mysin, sings);
    Function        expf(myexp, sings);
    Basis           basis(4);
    basis.template enforceBoundaryCondition<lawa::DirichletBC>();
    IndexSet        indexset;
    Coeff1D         coeff;

    std::vector<Function> fvec;

    int rank = 2;
    int dim  = 64;

    for (int i=1; i<=32; ++i) {
        fvec.push_back(cosf);
        fvec.push_back(onef);
        fvec.push_back(x2f);
        fvec.push_back(onef);
    }

    SepCoeff        coeffs(rank, dim);
    IndexSetVec     indexsetvec(dim);
    lawa::SeparableFunctionD<T> F(fvec, rank, dim);
    MatInt                      derivs(rank, dim);
    for (int i=1; i<=rank; ++i) {
        for (int j=1; j<=dim; ++j) {
            derivs(i,j) = 0;
            _deltas.push_back(deltas);
        }
    }

    lawa::SeparableRHSD<T, Basis>   Fint(basis, F, _deltas, derivs);

    getFullIndexSet(basis, indexset, 2);

    std::cout << "The index set size is\n" << indexset.size()
              << std::endl;

    for (int l=0; (unsigned)l<indexsetvec.size(); ++l) {
        indexsetvec[l] = indexset;
    }

    /* Map */
    lawa::Mapwavind<Index1D> map(dim);
    map.rehash(50);

    genCoefficients(coeffs, Fint, indexsetvec);
    lawa::HTCoefficients<T, Basis>    f(dim, basis, map);
    lawa::HTCoefficients<T, Basis>    u(dim, basis, map);
    lawa::HTCoefficients<T, Basis>    r(dim, basis, map);

    Laplace1D       LaplaceBil(basis);
    RefLaplace1D    RefLaplaceBil(basis.refinementbasis);
    Identity1D      IdentityBil(basis);
    RefIdentity1D   RefIdentityBil(basis.refinementbasis);
    LOp_Lapl1D      lapl(basis, basis, RefLaplaceBil, LaplaceBil);

    Sepop A(lapl, dim, dim);

    lawa::Sepdiagscal<Basis>    S(dim, basis);
    setScaling(S, 0.5);

    lawa::HTAWGM_Params  params;
    params.maxit_pcg  = 100;
    params.maxit_awgm = 100;
    params.tol_awgm   = 1e-08;
    params.delta1_pcg = 1e-01;
    params.delta2_pcg = 1e-01;
    params.delta3_pcg = 1e-01;
    params.alpha      = 0.95;
    params.recompr    = 1e-02;
    params.gamma      = 0.1;
    params.theta      = 1e-08;


    std::cout << "HTAWGM params =\n";
    std::cout << params << std::endl;

    unsigned its;
    double   res;

    its = htawgm(A, S, u, Fint, indexsetvec, res, params);

    std::cout << "htawgm took " << its << " iterations to reach "
              << res << " accuracy" << std::endl;
    std::cout << "Final scaling set to\n" << S << std::endl;

    return 0;
}
Ejemplo n.º 4
0
bool VMCcuda::run() {
    if (UseDrift == "yes")
        return runWithDrift();

    resetRun();
    IndexType block = 0;
    IndexType nAcceptTot = 0;
    IndexType nRejectTot = 0;
    IndexType updatePeriod= (QMCDriverMode[QMC_UPDATE_MODE])
                            ? Period4CheckProperties
                            : (nBlocks+1)*nSteps;

    int nat = W.getTotalNum();
    int nw  = W.getActiveWalkers();

    vector<RealType>  LocalEnergy(nw);
    vector<PosType>   delpos(nw);
    vector<PosType>   newpos(nw);
    vector<ValueType> ratios(nw);
    vector<GradType>  oldG(nw), newG(nw);
    vector<ValueType> oldL(nw), newL(nw);
    vector<Walker_t*> accepted(nw);
    Matrix<ValueType> lapl(nw, nat);
    Matrix<GradType>  grad(nw, nat);
    double Esum;

    // First do warmup steps
    for (int step=0; step<myWarmupSteps; step++) {
        for(int iat=0; iat<nat; ++iat)  {
            //create a 3N-Dimensional Gaussian with variance=1
            makeGaussRandomWithEngine(delpos,Random);
            for(int iw=0; iw<nw; ++iw) {
                PosType G = W[iw]->Grad[iat];
                newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw];
                ratios[iw] = 1.0;
            }
            W.proposeMove_GPU(newpos, iat);

            Psi.ratio(W,iat,ratios,newG, newL);

            accepted.clear();
            vector<bool> acc(nw, false);
            for(int iw=0; iw<nw; ++iw) {
                if(ratios[iw]*ratios[iw] > Random()) {
                    accepted.push_back(W[iw]);
                    nAccept++;
                    W[iw]->R[iat] = newpos[iw];
                    acc[iw] = true;
                }
                else
                    nReject++;
            }
            W.acceptMove_GPU(acc);
            if (accepted.size())
                Psi.update(accepted,iat);
        }
    }

    do {
        IndexType step = 0;
        nAccept = nReject = 0;
        Esum = 0.0;
        Estimators->startBlock(nSteps);
        do
        {
            ++step;
            ++CurrentStep;
            for (int isub=0; isub<nSubSteps; isub++) {
                for(int iat=0; iat<nat; ++iat)  {
                    //create a 3N-Dimensional Gaussian with variance=1
                    makeGaussRandomWithEngine(delpos,Random);
                    for(int iw=0; iw<nw; ++iw) {
                        PosType G = W[iw]->Grad[iat];
                        newpos[iw]=W[iw]->R[iat] + m_sqrttau*delpos[iw];
                        ratios[iw] = 1.0;
                    }
                    W.proposeMove_GPU(newpos, iat);

                    Psi.ratio(W,iat,ratios,newG, newL);

                    accepted.clear();
                    vector<bool> acc(nw, false);
                    for(int iw=0; iw<nw; ++iw) {
                        if(ratios[iw]*ratios[iw] > Random()) {
                            accepted.push_back(W[iw]);
                            nAccept++;
                            W[iw]->R[iat] = newpos[iw];
                            acc[iw] = true;
                        }
                        else
                            nReject++;
                    }
                    W.acceptMove_GPU(acc);
                    if (accepted.size())
                        Psi.update(accepted,iat);
                }
            }
            Psi.gradLapl(W, grad, lapl);
            H.evaluate (W, LocalEnergy);
            if (myPeriod4WalkerDump && (CurrentStep % myPeriod4WalkerDump)==0)
                W.saveEnsemble();
            Estimators->accumulate(W);
        } while(step<nSteps);
        Psi.recompute(W);

        // vector<RealType> logPsi(W.WalkerList.size(), 0.0);
        // Psi.evaluateLog(W, logPsi);

        double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
        Estimators->stopBlock(accept_ratio);

        nAcceptTot += nAccept;
        nRejectTot += nReject;
        ++block;
        recordBlock(block);
    } while(block<nBlocks);

    //Mover->stopRun();

    //finalize a qmc section
    return finalize(block);
}
Ejemplo n.º 5
0
bool VMCcuda::runWithDrift()
{
    resetRun();
    IndexType block = 0;
    IndexType nAcceptTot = 0;
    IndexType nRejectTot = 0;
    int nat = W.getTotalNum();
    int nw  = W.getActiveWalkers();

    vector<RealType>  LocalEnergy(nw), oldScale(nw), newScale(nw);
    vector<PosType>   delpos(nw);
    vector<PosType>   dr(nw);
    vector<PosType>   newpos(nw);
    vector<ValueType> ratios(nw), rplus(nw), rminus(nw);
    vector<PosType>  oldG(nw), newG(nw);
    vector<ValueType> oldL(nw), newL(nw);
    vector<Walker_t*> accepted(nw);
    Matrix<ValueType> lapl(nw, nat);
    Matrix<GradType>  grad(nw, nat);

    // First, do warmup steps
    for (int step=0; step<myWarmupSteps; step++) {
        for(int iat=0; iat<nat; iat++) {
            Psi.getGradient (W, iat, oldG);

            //create a 3N-Dimensional Gaussian with variance=1
            makeGaussRandomWithEngine(delpos,Random);
            for(int iw=0; iw<nw; iw++) {
                oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
                dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]);
                newpos[iw]=W[iw]->R[iat] + dr[iw];
                ratios[iw] = 1.0;
            }
            W.proposeMove_GPU(newpos, iat);

            Psi.ratio(W,iat,ratios,newG, newL);

            accepted.clear();
            vector<bool> acc(nw, false);
            for(int iw=0; iw<nw; ++iw) {
                PosType drOld =
                    newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
                RealType logGf = -m_oneover2tau * dot(drOld, drOld);
                newScale[iw]   = getDriftScale(m_tauovermass,newG[iw]);
                PosType drNew  =
                    (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];

                RealType logGb =  -m_oneover2tau * dot(drNew, drNew);
                RealType x = logGb - logGf;
                RealType prob = ratios[iw]*ratios[iw]*std::exp(x);

                if(Random() < prob) {
                    accepted.push_back(W[iw]);
                    nAccept++;
                    W[iw]->R[iat] = newpos[iw];
                    acc[iw] = true;
                }
                else
                    nReject++;
            }
            W.acceptMove_GPU(acc);
            if (accepted.size())
                Psi.update(accepted,iat);
        }
    }

    // Now do data collection steps
    do {
        IndexType step = 0;
        nAccept = nReject = 0;
        Estimators->startBlock(nSteps);
        do {
            step++;
            CurrentStep++;
            for (int isub=0; isub<nSubSteps; isub++) {
                for(int iat=0; iat<nat; iat++) {
                    Psi.getGradient (W, iat, oldG);

                    //create a 3N-Dimensional Gaussian with variance=1
                    makeGaussRandomWithEngine(delpos,Random);
                    for(int iw=0; iw<nw; iw++) {
                        oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
                        dr[iw] = (m_sqrttau*delpos[iw]) + (oldScale[iw]*oldG[iw]);
                        newpos[iw]=W[iw]->R[iat] + dr[iw];
                        ratios[iw] = 1.0;
                    }
                    W.proposeMove_GPU(newpos, iat);

                    Psi.ratio(W,iat,ratios,newG, newL);

                    accepted.clear();
                    vector<bool> acc(nw, false);
                    for(int iw=0; iw<nw; ++iw) {
                        PosType drOld =
                            newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
                        // if (dot(drOld, drOld) > 25.0)
                        //   cerr << "Large drift encountered!  Old drift = " << drOld << endl;
                        RealType logGf = -m_oneover2tau * dot(drOld, drOld);
                        newScale[iw]   = getDriftScale(m_tauovermass,newG[iw]);
                        PosType drNew  =
                            (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
                        // if (dot(drNew, drNew) > 25.0)
                        //   cerr << "Large drift encountered!  Drift = " << drNew << endl;
                        RealType logGb =  -m_oneover2tau * dot(drNew, drNew);
                        RealType x = logGb - logGf;
                        RealType prob = ratios[iw]*ratios[iw]*std::exp(x);

                        if(Random() < prob) {
                            accepted.push_back(W[iw]);
                            nAccept++;
                            W[iw]->R[iat] = newpos[iw];
                            acc[iw] = true;
                        }
                        else
                            nReject++;
                    }
                    W.acceptMove_GPU(acc);
                    if (accepted.size())
                        Psi.update(accepted,iat);
                }
                // cerr << "Rank = " << myComm->rank() <<
                //   "  CurrentStep = " << CurrentStep << "  isub = " << isub << endl;
            }
            Psi.gradLapl(W, grad, lapl);
            H.evaluate (W, LocalEnergy);
            if (myPeriod4WalkerDump && (CurrentStep % myPeriod4WalkerDump)==0)
                W.saveEnsemble();
            Estimators->accumulate(W);
        } while(step<nSteps);
        Psi.recompute(W);

        double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
        Estimators->stopBlock(accept_ratio);

        nAcceptTot += nAccept;
        nRejectTot += nReject;
        ++block;
        recordBlock(block);
    } while(block<nBlocks);
    //finalize a qmc section
    if (!myComm->rank())
        gpu::cuda_memory_manager.report();
    return finalize(block);
}
Ejemplo n.º 6
0
  bool DMCcuda::run() 
  { 
    bool NLmove = NonLocalMove == "yes";
    bool scaleweight = ScaleWeight == "yes";
    if (NLmove) 
      app_log() << "  Using Casula nonlocal moves in DMCcuda.\n";
    if (scaleweight)
      app_log() << "  Scaling weight per Umrigar/Nightengale.\n";
      
    resetRun();
    Mover->MaxAge = 1;
    IndexType block = 0;
    IndexType nAcceptTot = 0;
    IndexType nRejectTot = 0;
    int nat = W.getTotalNum();
    int nw  = W.getActiveWalkers();
    
    vector<RealType>  LocalEnergy(nw), LocalEnergyOld(nw), 
      oldScale(nw), newScale(nw);
    vector<PosType>   delpos(nw);
    vector<PosType>   dr(nw);
    vector<PosType>   newpos(nw);
    vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
    vector<PosType>  oldG(nw), newG(nw);
    vector<ValueType> oldL(nw), newL(nw);
    vector<Walker_t*> accepted(nw);
    Matrix<ValueType> lapl(nw, nat);
    Matrix<GradType>  grad(nw, nat);
    vector<ValueType> V2(nw), V2bar(nw);
    vector<vector<NonLocalData> > Txy(nw);

    for (int iw=0; iw<nw; iw++)
      W[iw]->Weight = 1.0;
    do {
      IndexType step = 0;
      nAccept = nReject = 0;
      Estimators->startBlock(nSteps);
      
      do {
        step++;
	CurrentStep++;
	nw = W.getActiveWalkers();
	ResizeTimer.start();
	LocalEnergy.resize(nw);       oldScale.resize(nw);
	newScale.resize(nw);          delpos.resize(nw);
	dr.resize(nw);                newpos.resize(nw);
	ratios.resize(nw);            rplus.resize(nw);
	rminus.resize(nw);            oldG.resize(nw);
	newG.resize(nw);              oldL.resize(nw);
	newL.resize(nw);              accepted.resize(nw);
	lapl.resize(nw, nat);         grad.resize(nw, nat);
	R2prop.resize(nw,0.0);        R2acc.resize(nw,0.0);
	V2.resize(nw,0.0);            V2bar.resize(nw,0.0);

	W.updateLists_GPU();
	ResizeTimer.stop();

	if (NLmove) {
	  Txy.resize(nw);
	  for (int iw=0; iw<nw; iw++) {
	    Txy[iw].clear();
	    Txy[iw].push_back(NonLocalData(-1, 1.0, PosType()));
	  }
	}

	for (int iw=0; iw<nw; iw++)
	  W[iw]->Age++;
	
	DriftDiffuseTimer.start();
        for(int iat=0; iat<nat; iat++) {
	  Psi.getGradient (W, iat, oldG);

          //create a 3N-Dimensional Gaussian with variance=1
          makeGaussRandomWithEngine(delpos,Random);
          for(int iw=0; iw<nw; iw++) {
	    delpos[iw] *= m_sqrttau;
	    oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
	    dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
            newpos[iw]=W[iw]->R[iat] + dr[iw];
	    ratios[iw] = 1.0;
	    R2prop[iw] += dot(delpos[iw], delpos[iw]);
	  }
	  W.proposeMove_GPU(newpos, iat);
	  
          Psi.ratio(W,iat,ratios,newG, newL);
	  	  
          accepted.clear();
	  vector<bool> acc(nw, false);
          for(int iw=0; iw<nw; ++iw) {
	    PosType drOld = 
	      newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
	    RealType logGf = -m_oneover2tau * dot(drOld, drOld);
	    newScale[iw]   = getDriftScale(m_tauovermass,newG[iw]);
	    PosType drNew  = 
	      (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
	    RealType logGb =  -m_oneover2tau * dot(drNew, drNew);
	    RealType x = logGb - logGf;
	    RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
	    
            if(Random() < prob && ratios[iw] > 0.0) {
              accepted.push_back(W[iw]);
	      nAccept++;
	      W[iw]->R[iat] = newpos[iw];
	      W[iw]->Age = 0;
	      acc[iw] = true;
	      R2acc[iw] += dot(delpos[iw], delpos[iw]);
	      V2[iw]    += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
	      V2bar[iw] +=  newScale[iw] *  newScale[iw] * dot(newG[iw],newG[iw]);
	    }
	    else {
	      nReject++;
	      V2[iw]    += m_tauovermass * m_tauovermass * dot(oldG[iw],oldG[iw]);
	      V2bar[iw] +=  oldScale[iw] *  oldScale[iw] * dot(oldG[iw],oldG[iw]);
	    }	      
	  }
	  W.acceptMove_GPU(acc);
	  if (accepted.size())
	    Psi.update(accepted,iat);
	}
	DriftDiffuseTimer.stop();
	//	Psi.recompute(W, false);
	Psi.gradLapl(W, grad, lapl);
	HTimer.start();
	if (NLmove)	  H.evaluate (W, LocalEnergy, Txy);
	else    	  H.evaluate (W, LocalEnergy);
	HTimer.stop();
// 	for (int iw=0; iw<nw; iw++) {
// 	  branchEngine->clampEnergy(LocalEnergy[iw]);
// 	  W[iw]->getPropertyBase()[LOCALENERGY] = LocalEnergy[iw];
// 	}
	if (CurrentStep == 1)
	  LocalEnergyOld = LocalEnergy;
	
	if (NLmove) {
	  // Now, attempt nonlocal move
	  accepted.clear();
	  vector<int> iatList;
	  vector<PosType> accPos;
	  for (int iw=0; iw<nw; iw++) {
	    /// HACK HACK HACK
// 	    if (LocalEnergy[iw] < -2300.0) {
// 	      cerr << "Walker " << iw << " has energy " 
// 		   << LocalEnergy[iw] << endl;;
// 	      double maxWeight = 0.0;
// 	      int elMax = -1;
// 	      PosType posMax;
// 	      for (int j=1; j<Txy[iw].size(); j++)
// 		if (std::fabs(Txy[iw][j].Weight) > std::fabs(maxWeight)) {
// 		  maxWeight = Txy[iw][j].Weight;
// 		  elMax = Txy[iw][j].PID;
// 		  posMax = W[iw]->R[elMax] + Txy[iw][j].Delta;
// 		}
// 	      cerr << "Maximum weight is " << maxWeight << " for electron " 
// 		   << elMax << " at position " << posMax << endl;
// 	      PosType unit = W.Lattice.toUnit(posMax);
// 	      unit[0] -= round(unit[0]);
// 	      unit[1] -= round(unit[1]);
// 	      unit[2] -= round(unit[2]);
// 	      cerr << "Reduced position = " << unit << endl;
// 	    }

	    int ibar = NLop.selectMove(Random(), Txy[iw]);
		    
	    if (ibar) {
	      accepted.push_back(W[iw]);
	      int iat = Txy[iw][ibar].PID;
	      iatList.push_back(iat);
	      accPos.push_back(W[iw]->R[iat] + Txy[iw][ibar].Delta);
	    }
	  }
	  if (accepted.size()) {
	    Psi.ratio(accepted,iatList, accPos, ratios, newG, newL);
	    Psi.update(accepted,iatList);
	    for (int i=0; i<accepted.size(); i++)
	      accepted[i]->R[iatList[i]] = accPos[i];
	    W.NLMove_GPU (accepted, accPos, iatList);
	    // HACK HACK HACK
	    // Recompute the kinetic energy
	    // Psi.gradLapl(W, grad, lapl);
	    // H.evaluate (W, LocalEnergy);
	    //W.copyWalkersToGPU();
	  }
	}

	// Now branch
	BranchTimer.start();
	for (int iw=0; iw<nw; iw++) {
	  RealType v2=0.0, v2bar=0.0;
	  for(int iat=0; iat<nat; iat++) {
	    v2 += dot(W.G[iat],W.G[iat]);
	    RealType newscale = getDriftScale(m_tauovermass,newG[iw]);
	    v2 += m_tauovermass * m_tauovermass * dot(newG[iw],newG[iw]);
	    v2bar +=  newscale * newscale * dot(newG[iw],newG[iw]);
	  }
	  //RealType scNew = std::sqrt(V2bar[iw] / V2[iw]);
	  RealType scNew = std::sqrt(v2bar/v2);
	  RealType scOld = (CurrentStep == 1) ? scNew : W[iw]->getPropertyBase()[DRIFTSCALE];

	  W[iw]->getPropertyBase()[DRIFTSCALE] = scNew;
	  // fprintf (stderr, "iw = %d  scNew = %1.8f  scOld = %1.8f\n", iw, scNew, scOld);
	  RealType tauRatio = R2acc[iw] / R2prop[iw];
	  if (tauRatio < 0.5) 
	    cerr << "  tauRatio = " << tauRatio << endl;
	  RealType taueff = m_tauovermass * tauRatio;
	  if (scaleweight) 
	    W[iw]->Weight *= branchEngine->branchWeightTau
	      (LocalEnergy[iw], LocalEnergyOld[iw], scNew, scOld, taueff);
	  else
	    W[iw]->Weight *= branchEngine->branchWeight
	      (LocalEnergy[iw], LocalEnergyOld[iw]);

	  W[iw]->getPropertyBase()[R2ACCEPTED] = R2acc[iw];
	  W[iw]->getPropertyBase()[R2PROPOSED] = R2prop[iw];
	  
	}
	Mover->setMultiplicity(W.begin(), W.end());
	branchEngine->branch(CurrentStep,W);
	nw = W.getActiveWalkers();
	LocalEnergyOld.resize(nw);
	for (int iw=0; iw<nw; iw++)
	  LocalEnergyOld[iw] = W[iw]->getPropertyBase()[LOCALENERGY];
	BranchTimer.stop();
      } while(step<nSteps);
      Psi.recompute(W, true);


      double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
      Estimators->stopBlock(accept_ratio);

      nAcceptTot += nAccept;
      nRejectTot += nReject;
      ++block;
      
      recordBlock(block);
    } while(block<nBlocks);
    //finalize a qmc section
    return finalize(block);
  }
Ejemplo n.º 7
0
  bool DMCcuda::runWithNonlocal() 
  { 
    resetRun();
    Mover->MaxAge = 1;
    IndexType block = 0;
    IndexType nAcceptTot = 0;
    IndexType nRejectTot = 0;
    int nat = W.getTotalNum();
    int nw  = W.getActiveWalkers();
    
    vector<RealType>  LocalEnergy(nw), LocalEnergyOld(nw), 
      oldScale(nw), newScale(nw);
    vector<PosType>   delpos(nw);
    vector<PosType>   dr(nw);
    vector<PosType>   newpos(nw);
    vector<ValueType> ratios(nw), rplus(nw), rminus(nw), R2prop(nw), R2acc(nw);
    vector<PosType>  oldG(nw), newG(nw);
    vector<ValueType> oldL(nw), newL(nw);
    vector<Walker_t*> accepted(nw);
    Matrix<ValueType> lapl(nw, nat);
    Matrix<GradType>  grad(nw, nat);
    vector<vector<NonLocalData> > Txy(nw);
    for (int iw=0; iw<nw; iw++)
      W[iw]->Weight = 1.0;
    do {
      IndexType step = 0;
      nAccept = nReject = 0;
      Estimators->startBlock(nSteps);
      
      do {
        step++;
	CurrentStep++;
	nw = W.getActiveWalkers();
	LocalEnergy.resize(nw);    	oldScale.resize(nw);
	newScale.resize(nw);    	delpos.resize(nw);
	dr.resize(nw);                  newpos.resize(nw);
	ratios.resize(nw);	        rplus.resize(nw);
	rminus.resize(nw);              oldG.resize(nw);
	newG.resize(nw);                oldL.resize(nw);
	newL.resize(nw);                accepted.resize(nw);
	lapl.resize(nw, nat);           grad.resize(nw, nat);
	R2prop.resize(nw,0.0);          R2acc.resize(nw,0.0);

	W.updateLists_GPU();

	Txy.resize(nw);
	for (int iw=0; iw<nw; iw++) {
	  Txy[iw].clear();
	  Txy[iw].push_back(NonLocalData(-1, 1.0, PosType()));
	  W[iw]->Age++;
	}
	
        for(int iat=0; iat<nat; iat++) {
	  Psi.getGradient (W, iat, oldG);
	  
          //create a 3N-Dimensional Gaussian with variance=1
          makeGaussRandomWithEngine(delpos,Random);
          for(int iw=0; iw<nw; iw++) {
	    delpos[iw] *= m_sqrttau;
	    oldScale[iw] = getDriftScale(m_tauovermass,oldG[iw]);
	    dr[iw] = delpos[iw] + (oldScale[iw]*oldG[iw]);
            newpos[iw]=W[iw]->R[iat] + dr[iw];
	    ratios[iw] = 1.0;
	    R2prop[iw] += dot(delpos[iw], delpos[iw]);
	  }
	  W.proposeMove_GPU(newpos, iat);
	  
          Psi.ratio(W,iat,ratios,newG, newL);
	  	  
          accepted.clear();
	  vector<bool> acc(nw, false);
          for(int iw=0; iw<nw; ++iw) {
	    PosType drOld = 
	      newpos[iw] - (W[iw]->R[iat] + oldScale[iw]*oldG[iw]);
	    RealType logGf = -m_oneover2tau * dot(drOld, drOld);
	    newScale[iw]   = getDriftScale(m_tauovermass,newG[iw]);
	    PosType drNew  = 
	      (newpos[iw] + newScale[iw]*newG[iw]) - W[iw]->R[iat];
	    RealType logGb =  -m_oneover2tau * dot(drNew, drNew);
	    RealType x = logGb - logGf;
	    RealType prob = ratios[iw]*ratios[iw]*std::exp(x);
	    
            if(Random() < prob && ratios[iw] > 0.0) {
              accepted.push_back(W[iw]);
	      nAccept++;
	      W[iw]->R[iat] = newpos[iw];
	      W[iw]->Age = 0;
	      acc[iw] = true;
	      R2acc[iw] += dot(delpos[iw], delpos[iw]);
	    }
	    else 
	      nReject++;
	  }
	  W.acceptMove_GPU(acc);
	  if (accepted.size())
	    Psi.update(accepted,iat);
	}
	for (int iw=0; iw < nw; iw++) 
	  if (W[iw]->Age)
	    cerr << "Encountered stuck walker with iw=" << iw << endl;

	//	Psi.recompute(W, false);
	Psi.gradLapl(W, grad, lapl);
	H.evaluate (W, LocalEnergy, Txy);
	if (CurrentStep == 1)
	  LocalEnergyOld = LocalEnergy;
	
	// Now, attempt nonlocal move
	accepted.clear();
	vector<int> iatList;
	vector<PosType> accPos;
	for (int iw=0; iw<nw; iw++) {
	  int ibar = NLop.selectMove(Random(), Txy[iw]);
	  // cerr << "Txy[iw].size() = " << Txy[iw].size() << endl;
	  if (ibar) {
	    accepted.push_back(W[iw]);
	    int iat = Txy[iw][ibar].PID;
	    iatList.push_back(iat);
	    accPos.push_back(W[iw]->R[iat] + Txy[iw][ibar].Delta);
	  }
	}
	if (accepted.size()) {
	  //   W.proposeMove_GPU(newpos, iatList);
	  Psi.ratio(accepted,iatList, accPos, ratios, newG, newL);
	  Psi.update(accepted,iatList);
	  for (int i=0; i<accepted.size(); i++)
	    accepted[i]->R[iatList[i]] = accPos[i];
	  W.copyWalkersToGPU();
	}

	// Now branch
	for (int iw=0; iw<nw; iw++) {
	  W[iw]->Weight *= branchEngine->branchWeight(LocalEnergy[iw], LocalEnergyOld[iw]);
	  W[iw]->getPropertyBase()[R2ACCEPTED] = R2acc[iw];
	  W[iw]->getPropertyBase()[R2PROPOSED] = R2prop[iw];
	}
	Mover->setMultiplicity(W.begin(), W.end());
	branchEngine->branch(CurrentStep,W);
	nw = W.getActiveWalkers();
	LocalEnergyOld.resize(nw);
	for (int iw=0; iw<nw; iw++)
	  LocalEnergyOld[iw] = W[iw]->getPropertyBase()[LOCALENERGY];
      } while(step<nSteps);
      Psi.recompute(W, true);


      double accept_ratio = (double)nAccept/(double)(nAccept+nReject);
      Estimators->stopBlock(accept_ratio);

      nAcceptTot += nAccept;
      nRejectTot += nReject;
      ++block;
      
      recordBlock(block);
    } while(block<nBlocks);
    //finalize a qmc section
    return finalize(block);
  }
SPOSetBase*
EinsplineSetBuilder::createSPOSet(xmlNodePtr cur)
{
  //use 2 bohr as the default when truncated orbitals are used based on the extend of the ions
  BufferLayer=2.0;
  OhmmsAttributeSet attribs;
  int numOrbs = 0;
  qafm=0;
  int sortBands(1);
  string sourceName;
  string spo_prec("double");
  string truncate("no");
#if defined(QMC_CUDA)
  string useGPU="yes";
#else
  string useGPU="no";
#endif
  attribs.add (H5FileName, "href");
  attribs.add (TileFactor, "tile");
  attribs.add (sortBands,  "sort");
  attribs.add (qafm,  "afmshift");
  attribs.add (TileMatrix, "tilematrix");
  attribs.add (TwistNum,   "twistnum");
  attribs.add (givenTwist,   "twist");
  attribs.add (sourceName, "source");
  attribs.add (MeshFactor, "meshfactor");
  attribs.add (useGPU,     "gpu");
  attribs.add (spo_prec,   "precision");
  attribs.add (truncate,   "truncate");
  attribs.add (BufferLayer, "buffer");
  attribs.put (XMLRoot);
  attribs.add (numOrbs,    "size");
  attribs.add (numOrbs,    "norbs");
  attribs.put (cur);
  ///////////////////////////////////////////////
  // Read occupation information from XML file //
  ///////////////////////////////////////////////
  cur = cur->children;
  int spinSet = -1;
  vector<int> Occ_Old(0,0);
  Occ.resize(0,0);
  bool NewOcc(false);
  while (cur != NULL)
  {
    string cname((const char*)(cur->name));
    if(cname == "occupation")
    {
      string occ_mode("ground");
      occ_format="energy";
      particle_hole_pairs=0;
      OhmmsAttributeSet oAttrib;
      oAttrib.add(occ_mode,"mode");
      oAttrib.add(spinSet,"spindataset");
      oAttrib.add(occ_format,"format");
      oAttrib.add(particle_hole_pairs,"pairs");
      oAttrib.put(cur);
      if(occ_mode == "excited")
      {
        putContent(Occ,cur);
      }
      else
        if(occ_mode != "ground")
        {
          app_error() << "Only ground state occupation currently supported "
                      << "in EinsplineSetBuilder.\n";
          APP_ABORT("EinsplineSetBuilder::createSPOSet");
        }
    }
    cur = cur->next;
  }
  if (Occ != Occ_Old)
  {
    NewOcc=true;
    Occ_Old = Occ;
  }
  else
    NewOcc=false;
#if defined(QMC_CUDA)
  app_log() << "\t  QMC_CUDA=1 Overwriting the precision of the einspline storage on the host.\n";
  spo_prec="double"; //overwrite
#endif
  H5OrbSet aset(H5FileName, spinSet, numOrbs);
  std::map<H5OrbSet,SPOSetBase*,H5OrbSet>::iterator iter;
  iter = SPOSetMap.find (aset);
  if ((iter != SPOSetMap.end() ) && (!NewOcc) && (qafm==0))
  {
    qafm=0;
    app_log() << "SPOSet parameters match in EinsplineSetBuilder:  "
              << "cloning EinsplineSet object.\n";
    return iter->second->makeClone();
  }
  // The tiling can be set by a simple vector, (e.g. 2x2x2), or by a
  // full 3x3 matrix of integers.  If the tilematrix was not set in
  // the input file...
  bool matrixNotSet = true;
  for (int i=0; i<3; i++)
    for (int j=0; j<3; j++)
      matrixNotSet = matrixNotSet && (TileMatrix(i,j) == 0);
  // then set the matrix to what may have been specified in the
  // tiling vector
  if (matrixNotSet)
    for (int i=0; i<3; i++)
      for (int j=0; j<3; j++)
        TileMatrix(i,j) = (i==j) ? TileFactor(i) : 0;
  if (myComm->rank() == 0)
    fprintf (stderr, " [ %2d %2d %2d\n   %2d %2d %2d\n   %2d %2d %2d ]\n",
             TileMatrix(0,0), TileMatrix(0,1), TileMatrix(0,2),
             TileMatrix(1,0), TileMatrix(1,1), TileMatrix(1,2),
             TileMatrix(2,0), TileMatrix(2,1), TileMatrix(2,2));
  if (numOrbs == 0)
  {
    app_error() << "You must specify the number of orbitals in the input file.\n";
    APP_ABORT("EinsplineSetBuilder::createSPOSet");
  }
  else
    app_log() << "  Reading " << numOrbs << " orbitals from HDF5 file.\n";
  Timer mytimer;
  mytimer.restart();
  /////////////////////////////////////////////////////////////////
  // Read the basic orbital information, without reading all the //
  // orbitals themselves.                                        //
  /////////////////////////////////////////////////////////////////
  if (myComm->rank() == 0)
    if (!ReadOrbitalInfo())
    {
      app_error() << "Error reading orbital info from HDF5 file.  Aborting.\n";
      APP_ABORT("EinsplineSetBuilder::createSPOSet");
    }
  app_log() <<  "TIMER  EinsplineSetBuilder::ReadOrbitalInfo " << mytimer.elapsed() << endl;
  myComm->barrier();
  mytimer.restart();
  BroadcastOrbitalInfo();
  app_log() <<  "TIMER  EinsplineSetBuilder::BroadcastOrbitalInfo " << mytimer.elapsed() << endl;
  app_log().flush();
  ///////////////////////////////////////////////////////////////////
  // Now, analyze the k-point mesh to figure out the what k-points //
  // are needed                                                    //
  ///////////////////////////////////////////////////////////////////
  PrimCell.set(Lattice);
  SuperCell.set(SuperLattice);
  for (int iat=0; iat<AtomicOrbitals.size(); iat++)
    AtomicOrbitals[iat].Lattice = Lattice;
  // Copy supercell into the ParticleSets
//     app_log() << "Overwriting XML lattice with that from the ESHDF file.\n";
//     PtclPoolType::iterator piter;
//     for(piter = ParticleSets.begin(); piter != ParticleSets.end(); piter++)
//       piter->second->Lattice.copy(SuperCell);
  AnalyzeTwists2();
  //////////////////////////////////
  // Create the OrbitalSet object
  //////////////////////////////////
  if (HaveLocalizedOrbs)
    OrbitalSet = new EinsplineSetLocal;
#ifdef QMC_CUDA
  else
    if (AtomicOrbitals.size() > 0)
    {
      if (UseRealOrbitals)
        OrbitalSet = new EinsplineSetHybrid<double>;
      else
        OrbitalSet = new EinsplineSetHybrid<complex<double> >;
    }
#endif
    else
    {
      if (UseRealOrbitals)
        OrbitalSet = new EinsplineSetExtended<double>;
      else
        OrbitalSet = new EinsplineSetExtended<complex<double> >;
    }
  //set the internal parameters
  setTiling(OrbitalSet,numOrbs);
  if (HaveLocalizedOrbs)
  {
    EinsplineSetLocal *restrict orbitalSet =
      dynamic_cast<EinsplineSetLocal*>(OrbitalSet);
    #pragma omp critical(read_einspline_orbs)
    {
      if ((spinSet == LastSpinSet) && (numOrbs <= NumOrbitalsRead) && (!NewOcc) )
        CopyBands(numOrbs);
      else
      {
        // Now, figure out occupation for the bands and read them
        OccupyBands(spinSet, sortBands);
        ReadBands (spinSet, orbitalSet);
      }
    }
    // Now, store what we have read
    LastOrbitalSet = OrbitalSet;
    LastSpinSet = spinSet;
    NumOrbitalsRead = numOrbs;
  }
  else // Otherwise, use EinsplineSetExtended
  {
    mytimer.restart();
    bool use_single= (spo_prec == "single" || spo_prec == "float");
    if (UseRealOrbitals)
    {
      OccupyBands(spinSet, sortBands);
      //check if a matching BsplineReaderBase exists
      BsplineReaderBase* spline_reader=0;
      //if(TargetPtcl.Lattice.SuperCellEnum != SUPERCELL_BULK && truncate=="yes")
      if(truncate=="yes")
      {
        if(use_single)
        {
          if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_OPEN)
            spline_reader= new SplineMixedAdoptorReader<SplineOpenAdoptor<float,double,3> >(this);
          else
            if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_SLAB)
              spline_reader= new SplineMixedAdoptorReader<SplineMixedAdoptor<float,double,3> >(this);
            else
              spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<float,double,3> >(this);
        }
        else
        {
          if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_OPEN)
            spline_reader= new SplineMixedAdoptorReader<SplineOpenAdoptor<double,double,3> >(this);
          else
            if(TargetPtcl.Lattice.SuperCellEnum == SUPERCELL_SLAB)
              spline_reader= new SplineMixedAdoptorReader<SplineMixedAdoptor<double,double,3> >(this);
            else
              spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<double,double,3> >(this);
        }
      }
      else
      {
        if(use_single)
          spline_reader= new SplineAdoptorReader<SplineR2RAdoptor<float,double,3> >(this);
      }
      if(spline_reader)
      {
        HasCoreOrbs=bcastSortBands(NumDistinctOrbitals,myComm->rank()==0);
        SPOSetBase* bspline_zd=spline_reader->create_spline_set(spinSet,OrbitalSet);
        delete spline_reader;
        if(bspline_zd)
          SPOSetMap[aset] = bspline_zd;
        return bspline_zd;
      }
      else
      {
        app_log() << ">>>> Creating EinsplineSetExtended<double> <<<< " << endl;
        EinsplineSetExtended<double> *restrict orbitalSet =
          dynamic_cast<EinsplineSetExtended<double>* > (OrbitalSet);
        if (Format == ESHDF)
          ReadBands_ESHDF(spinSet,orbitalSet);
        else
          ReadBands(spinSet, orbitalSet);
      }
    }
    else
    {
      OccupyBands(spinSet, sortBands);
      BsplineReaderBase* spline_reader=0;
      if(truncate == "yes")
      {
        app_log() << "  Truncated orbitals with multiple kpoints are not supported yet!" << endl;
      }
      if(use_single)
      {
#if defined(QMC_COMPLEX)
        spline_reader= new SplineAdoptorReader<SplineC2CPackedAdoptor<float,double,3> >(this);
#else
        spline_reader= new SplineAdoptorReader<SplineC2RPackedAdoptor<float,double,3> >(this);
#endif
      }
      if(spline_reader)
      {
        RotateBands_ESHDF(spinSet, dynamic_cast<EinsplineSetExtended<complex<double> >*>(OrbitalSet));
        HasCoreOrbs=bcastSortBands(NumDistinctOrbitals,myComm->rank()==0);
        SPOSetBase* bspline_zd=spline_reader->create_spline_set(spinSet,OrbitalSet);
        delete spline_reader;
        if(bspline_zd)
          SPOSetMap[aset] = bspline_zd;
        return bspline_zd;
      }
      else
      {
        EinsplineSetExtended<complex<double> > *restrict orbitalSet =
          dynamic_cast<EinsplineSetExtended<complex<double> >*>(OrbitalSet);
        if (Format == ESHDF)
          ReadBands_ESHDF(spinSet,orbitalSet);
        else
          ReadBands(spinSet, orbitalSet);
      }
    }
    app_log() <<  "TIMER  EinsplineSetBuilder::ReadBands " << mytimer.elapsed() << endl;
  }
#ifndef QMC_COMPLEX
  if (myComm->rank()==0 && OrbitalSet->MuffinTins.size() > 0)
  {
    FILE *fout  = fopen ("TestMuffins.dat", "w");
    Vector<double> phi(numOrbs), lapl(numOrbs);
    Vector<PosType> grad(numOrbs);
    ParticleSet P;
    P.R.resize(6);
    for (int i=0; i<P.R.size(); i++)
      P.R[i] = PosType (0.0, 0.0, 0.0);
    PosType N = 0.25*PrimCell.a(0) + 0.25*PrimCell.a(1) + 0.25*PrimCell.a(2);
    for (double x=-1.0; x<=1.0; x+=0.0000500113412)
    {
      // for (double x=-0.003; x<=0.003; x+=0.0000011329343481381) {
      P.R[0] = x * (PrimCell.a(0) + 0.914*PrimCell.a(1) +
                    0.781413*PrimCell.a(2));
      double r = std::sqrt(dot(P.R[0], P.R[0]));
      double rN = std::sqrt(dot(P.R[0]-N, P.R[0]-N));
      OrbitalSet->evaluate(P, 0, phi, grad, lapl);
      // OrbitalSet->evaluate(P, 0, phi);
      fprintf (fout, "%1.12e ", r*x/std::fabs(x));
      for (int j=0; j<numOrbs; j++)
      {
        double gmag = std::sqrt(dot(grad[j],grad[j]));
        fprintf (fout, "%16.12e ",
                 /*phi[j]*phi[j]**/(-5.0/r  -0.5*lapl[j]/phi[j]));
        // double E = -5.0/r -0.5*lapl[j]/phi[j];
        fprintf (fout, "%16.12e ", phi[j]);
        fprintf (fout, "%16.12e ", gmag);
      }
      fprintf (fout, "\n");
    }
    fclose(fout);
  }
#endif
  SPOSetMap[aset] = OrbitalSet;
  if (sourceName.size() && (ParticleSets.find(sourceName) == ParticleSets.end()))
  {
    app_log() << "  EinsplineSetBuilder creates a ParticleSet " << sourceName << endl;
    ParticleSet* ions=new ParticleSet;
    ions->Lattice=TargetPtcl.Lattice;
    ESHDFIonsParser ap(*ions,H5FileID,myComm);
    ap.put(XMLRoot);
    ap.expand(TileMatrix);
    ions->setName(sourceName);
    ParticleSets[sourceName]=ions;
    //overwrite the lattice and assign random
    if(TargetPtcl.Lattice.SuperCellEnum)
    {
      TargetPtcl.Lattice=ions->Lattice;
      makeUniformRandom(TargetPtcl.R);
      TargetPtcl.R.setUnit(PosUnit::LatticeUnit);
      TargetPtcl.convert2Cart(TargetPtcl.R);
      TargetPtcl.createSK();
    }
  }
#ifdef QMC_CUDA
  if (useGPU == "yes" || useGPU == "1")
  {
    app_log() << "Initializing GPU data structures.\n";
    OrbitalSet->initGPU();
  }
#endif
  return OrbitalSet;
}
Ejemplo n.º 9
0
void STWarp<T>::warpingIteration( const WarpingField<T> &warpField,
                                  const Video<T> &Bx, 
                                  const Video<T> &By, 
                                  const Video<T> &Bt, 
                                  const Video<T> &C, 
                                  WarpingField<T> &dWarpField) {

    int height     = dimensions[0];
    int width      = dimensions[1];
    int nFrames    = dimensions[2];
    // int nChannels  = dimensions[3];

    if(params.verbosity > 1) {
        fprintf(stderr, "    - preparing linear system...");
    }
    // Differentiate (u,v,w)+d(u,v,w) in x,y,t
    WarpingField<T> warpDX(warpField.size());
    WarpingField<T> warpDY(warpField.size());
    WarpingField<T> warpDT(warpField.size());
    WarpingField<T> warpTotal(warpField);
    warpTotal.add(dWarpField);

    VideoProcessing::dx(warpTotal,warpDX);
    VideoProcessing::dy(warpTotal,warpDY);
    VideoProcessing::dt(warpTotal,warpDT);

    // Compute smoothness cost
    Video<T> smoothCost(height, width, nFrames, 9);
    Video<T> lapl(warpField.size());
    computeSmoothCost(warpDX, warpDY, warpDT, warpField, smoothCost, lapl);

    computeOcclusion(warpTotal, C, occlusion);

    // Compute data cost
    Video<T> dataCost;
    
    if(params.useFeatures){
        dataCost = Video<T>(height,width,nFrames,2);
    }else{
        dataCost = Video<T>(height,width,nFrames,1);
    }
    computeDataCost(Bx, By, Bt, C, dWarpField, occlusion, dataCost);

    // Prepare system
    Video<T> CBx(height,width,nFrames,1);
    Video<T> CBy(height,width,nFrames,1);
    Video<T> CBt(height,width,nFrames,1);
    Video<T> Bx2(height,width,nFrames,1);
    Video<T> By2(height,width,nFrames,1);
    Video<T> Bt2(height,width,nFrames,1);
    Video<T> Bxy(height,width,nFrames,1);
    Video<T> Bxt(height,width,nFrames,1);
    Video<T> Byt(height,width,nFrames,1);
    prepareLinearSystem(Bx, By, Bt, C, lapl, dataCost, 
                        CBx, CBy, CBt, Bx2, By2, Bt2,
                        Bxy, Bxt, Byt);
    if(params.verbosity > 1) {
        fprintf(stderr, "done.\n");
    }

    if(params.verbosity > 1) {
        fprintf(stderr, "    - SOR...");
    }
    sor( dataCost, smoothCost, lapl,
         CBx, CBy, CBt, Bx2,
         By2, Bt2, Bxy, Bxt, Byt, dWarpField);
    if(params.verbosity > 1) {
        fprintf(stderr, "done.\n");
    }
}