Esempio n. 1
0
/*
 * Interval quotient a / b.
 *
 * An empty operand yields IFloat::empty.  Operands that compare <= 0 are
 * negated and the call recurses, so the direct computation at the bottom only
 * sees operands that did not compare <= 0.  A divisor that does not compare
 * >= 0 at that point yields (-inf, +inf); a divisor equal to IFloat::zero
 * yields IFloat::empty.
 *
 * NOTE(review): the exact semantics of <= / >= on IFloat are defined outside
 * this block; the tests are therefore kept in their original form and order.
 */
IFloat operator/(const IFloat &a,const IFloat &b)
{
    if (a.isempty || b.isempty)
    {
        return IFloat::empty;
    }

    // Sign normalisation by recursion on negated operands.
    if (a <= 0 && !(b <= 0))
    {
        return -(-a / b);
    }

    if (!(a <= 0) && b <= 0)
    {
        return -(a / -b);
    }

    if (a <= 0 && b <= 0)
    {
        return -a / -b;
    }

    // Divisor is not known to be >= 0 here: give up and return everything.
    if (!(b >= 0))
    {
        const Float inf = std::numeric_limits<Float>::infinity();
        return IFloat(-inf, inf);
    }

    if (b == IFloat::zero)
    {
        return IFloat::empty;
    }

    ProtectRounding rounding_guard;

    // The lower bound is obtained as -(x / -y): the negated quotient is
    // computed first and its sign flipped afterwards ("for rounding" in the
    // original).  The divisor endpoint depends on whether a >= 0 holds.
    const Float negated_lower = (a >= 0) ? a.a / -b.b
                                         : a.a / -b.a;

    return IFloat(-negated_lower, a.b / b.a);
}
Esempio n. 2
0
/*
 * Interval cosine.
 *
 * The argument is reduced with fmod(ifloat, twice_pi).  If the reduced
 * interval is at least one period wide, or is found to contain more than one
 * extremum of the cosine, the result is [-1, 1].  Otherwise the result is
 * built from directed-rounding cosines (cos_rd / cos_ru) of the endpoints,
 * with the bound at a contained extremum pinned to 1 or -1.
 */
IFloat cos(const IFloat &ifloat)
{
    if (ifloat.isempty)
        return ifloat;

    IFloat reduced = fmod(ifloat, IFloat::twice_pi);

    if (width(reduced) >= lower(IFloat::twice_pi))
        return IFloat(-1, 1);

    // Which kind of extremum (if any) has been seen inside the interval.
    enum { NO_EXTREMUM, MAXIMUM, MINIMUM } extremum = NO_EXTREMUM;

    // Maximum at 0 (tested on the unreduced argument, as in the original).
    if (ifloat.contains_zero())
        extremum = MAXIMUM;

    // Minimum at pi.
    if (reduced.contains(IFloat::pi) != false)
    {
        if (extremum != NO_EXTREMUM)
            return IFloat(-1, 1);

        extremum = MINIMUM;
    }

    // Maximum at 2*pi.
    if (reduced.contains(IFloat::twice_pi) != false)
    {
        if (extremum != NO_EXTREMUM)
            return IFloat(-1, 1);

        extremum = MAXIMUM;
    }

    /* This can be more precise */
    if (reduced.contains(IFloat::pi + IFloat::twice_pi) != false)
        return IFloat(-1, 1);

    UnprotectRounding rounding_release;

    if (extremum == NO_EXTREMUM)
    {
        // Monotone piece: increasing only when the upper end lies at or past
        // lower(pi) but before lower(2*pi); decreasing in every other case.
        const bool increasing = !(upper(reduced) < lower(IFloat::pi))
                                && upper(reduced) < lower(IFloat::twice_pi);

        if (increasing)
            return IFloat(cos_rd(lower(reduced)), cos_ru(upper(reduced)));

        return IFloat(cos_rd(upper(reduced)), cos_ru(lower(reduced)));
    }

    /* Probably this part can be made faster, with only one cosine */
    if (extremum == MAXIMUM)
        return IFloat(std::min(cos_rd(lower(reduced)), cos_rd(upper(reduced))), 1);

    return IFloat(-1, std::max(cos_ru(lower(reduced)), cos_ru(upper(reduced))));
}
Esempio n. 3
0
//------------------------------------------------------------------
// MatPcDag :
// Forwards to wilson_mdag, passing the out/in vectors and the gauge
// field reinterpreted as IFloat arrays, kappa converted to IFloat,
// and the Wilson library argument block.
//------------------------------------------------------------------
void DiracOpWilson::MatPcDag(Vector *out, Vector *in) {
  IFloat *const out_f   = (IFloat *)out;
  IFloat *const gauge_f = (IFloat *)gauge_field;
  IFloat *const in_f    = (IFloat *)in;
  Wilson *const wilson_p = (Wilson *)wilson_lib_arg;

  wilson_mdag(out_f, gauge_f, in_f, IFloat(kappa), wilson_p);
}
Esempio n. 4
0
//------------------------------------------------------------------
// MatPcDagMatPc :
// The fermion matrix that appears in the HMC evolution. It is
// Hermitian, built from the even/odd preconditioned Dirac operator
// matrix, and connects only odd-->odd sites; the in and out fields
// live on the odd checkerboard.
// If dot_prd is not 0, the on-node dot product
// <out, in> = <MatPcDagMatPc*in, in> is returned through dot_prd.
// The work itself is delegated to wilson_mdagm.
//------------------------------------------------------------------
void DiracOpWilson::MatPcDagMatPc(Vector *out, 
					 Vector *in, 
					 Float *dot_prd){
  IFloat *const out_f    = (IFloat *)out;
  IFloat *const gauge_f  = (IFloat *)gauge_field;
  IFloat *const in_f     = (IFloat *)in;
  IFloat *const dot_f    = (IFloat *)dot_prd;
  Wilson *const wilson_p = (Wilson *)wilson_lib_arg;

  wilson_mdagm(out_f, gauge_f, in_f, dot_f, IFloat(kappa), wilson_p);
}
Esempio n. 5
0
/*
 * Interval reciprocal 1 / ifloat.
 *
 * Empty input gives IFloat::empty.  An argument that compares <= 0 is handled
 * by recursing on its negation.  An argument that then does not compare >= 0
 * gives (-inf, +inf).  Otherwise the endpoints are inverted and swapped.
 */
IFloat multiplicativeInverse(const IFloat &ifloat)
{
    if (ifloat.isempty)
    {
        return IFloat::empty;
    }

    if (ifloat <= 0)
    {
        return -multiplicativeInverse(-ifloat);
    }

    if (!(ifloat >= 0))
    {
        const Float inf = std::numeric_limits<Float>::infinity();
        return IFloat(-inf, inf);
    }

    ProtectRounding rounding_guard;

    // Lower bound computed as -(1 / -upper): negated quotient first, sign
    // flipped afterwards ("for rounding" in the original).
    const Float negated_lower = 1 / -ifloat.b;

    return IFloat(-negated_lower, 1 / ifloat.a);
}
Esempio n. 6
0
//! Evolves the gauge field with the stored momentum: `steps` calls to
//! Lattice::EvolveGfield with step size `dt`, then advances the lattice MD
//! time by dt*steps, logs it, and reports timing through print_flops.
void AlgMomentum::evolve(Float dt, int steps) 
{
  const char *fname = "evolve()";

  // Accumulate elapsed wall time as (-start) + end.
  Float elapsed = -dclock();

  Lattice &lattice = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);

  for (int remaining = steps; remaining > 0; --remaining) {
    lattice.EvolveGfield(mom, dt);
  }

  lattice.MdTimeInc(dt*steps);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lattice.MdTime()));

  LatticeFactory::Destroy();

  elapsed += dclock();
  print_flops(cname, fname, 1968. * 4. * GJP.VolNodeSites() * steps, elapsed);
}
Esempio n. 7
0
//!< Heat Bath for the conjugate momentum
void AlgMomentum::heatbath() {

  const char *fname = "heatbath()";
  Float dtime = -dclock();

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  lat.RandGaussAntiHermMatrix(mom, 1.0);

  //!< reset MD time in Lattice (a momentum refresh means a new trajectory)
  lat.MdTime(0.0);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lat.MdTime()));
      
  LatticeFactory::Destroy();
  
  dtime += dclock();
  print_flops(cname, fname, 0, dtime);
}
Esempio n. 8
0
/*
 * Interval version of tan(pi * x).
 *
 * The argument is reduced with fmod(ifloat, 1).  If the reduced interval is
 * at least one period wide, or contains any of the points 1/2, 3/2, 5/2
 * (where tan(pi*x) has poles), the result is IFloat::all.  Otherwise the
 * result is built from the directed-rounding helpers tanpi_rd / tanpi_ru
 * applied to the lower and upper endpoints.
 *
 * Empty input is returned unchanged.
 */
IFloat tanpi (const IFloat &ifloat)
{
    if (ifloat.isempty)
        return ifloat;

    IFloat range = fmod(ifloat, 1);

    if (width(range) >= lower(1))
        return IFloat::all;

    // Records whether a pole has been found inside the reduced interval.
    // It must start out false: it is read below even when the first test
    // does not fire (the original left it uninitialized).
    bool set = false;

    if (range.contains(1 / 2.) != false)
        set = true;

    if (range.contains(3 / 2.) != false)
    {
        if (set)
            return IFloat::all;
        
        set = true;
    }

    if (range.contains(5 / 2.) != false)
        return IFloat::all;

    UnprotectRounding upr;

    // No pole inside: the endpoints bound the result directly.
    if (!set)
        return IFloat(tanpi_rd(lower(range)), tanpi_ru(upper(range)));

    /* This can be more precise if it is checked whether a pole is only
     * touched but not crossed. In that case only one extreme of the result
     * is infinite.
     */
    return IFloat::all;
}
Esempio n. 9
0
/*
 * Interval product a * b.
 *
 * An empty operand yields IFloat::empty.  Each operand is classified by the
 * sign of its endpoints (a.a is the lower bound, a.b the upper bound):
 *
 *   non-negative : lower bound >= 0
 *   mixed        : lower bound <  0 and upper bound > 0
 *   non-positive : everything else (upper bound <= 0)
 *
 * and the result endpoints are picked per case:
 *
 *   a \ b      non-negative      mixed                           non-positive
 *   non-neg    [aa*ba, ab*bb]    [ab*ba, ab*bb]                  [ab*ba, aa*bb]
 *   mixed      [aa*bb, ab*bb]    [min(aa*bb,ab*ba),              [ab*ba, aa*ba]
 *                                 max(aa*ba,ab*bb)]
 *   non-pos    [aa*bb, ab*ba]    [aa*bb, aa*ba]                  [ab*bb, aa*ba]
 *
 * Fixes relative to the previous version: the classification tested the
 * upper bound (so mixed operands were treated as non-negative and the mixed
 * branches for b were unreachable), the mixed x non-positive and
 * non-positive x mixed cases used the wrong endpoints, and the
 * non-positive x non-positive lower bound had the wrong sign.
 *
 * Every lower bound is computed as -(x * -y): the negated product is formed
 * first and its sign flipped afterwards ("for rounding" in the original;
 * presumably ProtectRounding selects upward rounding -- confirm against its
 * definition).
 */
IFloat operator*(const IFloat &a,const IFloat &b)
{
    if (a.isempty or b.isempty)
        return IFloat::empty;

    ProtectRounding pr;

    const bool posa = a.a >= 0,
               posb = b.a >= 0,
               unka = a.a <  0 and a.b > 0,
               unkb = b.a <  0 and b.b > 0;

    Float r; // Negated lower bound, for rounding

    if (posa)
    {
        if (posb)
        {
            r = a.a * -b.a; // For rounding
            return IFloat(-r, a.b * b.b);
        }

        if (unkb)
        {
            r = a.b * -b.a; // For rounding
            return IFloat(-r, a.b * b.b);
        }

        // b is non-positive
        r = a.b * -b.a; // For rounding
        return IFloat(-r, a.a * b.b);
    }

    if (unka)
    {
        if (posb)
        {
            r = a.a * -b.b; // For rounding
            return IFloat(-r, a.b * b.b);
        }

        if (unkb)
        {
            r = std::max(a.a * -b.b, a.b * -b.a); // For rounding
            return IFloat(-r, std::max(a.a * b.a, a.b * b.b));
        }

        // b is non-positive: extremes come from b's lower bound
        r = a.b * -b.a; // For rounding
        return IFloat(-r, a.a * b.a);
    }

    // a is non-positive from here on
    if (posb)
    {
        r = a.a * -b.b; // For rounding
        return IFloat(-r, a.b * b.a);
    }

    if (unkb)
    {
        // Extremes come from a's lower bound
        r = a.a * -b.b; // For rounding
        return IFloat(-r, a.a * b.a);
    }

    // Both non-positive: the product is non-negative
    r = a.b * -b.b; // For rounding
    return IFloat(-r, a.a * b.a);
}
Esempio n. 10
0
 /* Interval built from exp_rd(1) and exp_ru(1).  The lower bound goes
  * through a negate / cast-to-Float / negate sequence, as in the original.
  */
 inline IFloat e() const
 {
     const Float negated_lower = (Float)(-exp_rd(1));

     return IFloat(-negated_lower, exp_ru(1));
 }
Esempio n. 11
0
 /* Interval built from 2 * acos_rd(-1) and 2 * acos_ru(-1).
  * This is a working stub: it does not have the best possible precision.
  * The lower bound goes through a negate / cast-to-Float / negate sequence,
  * as in the original.
  */
 inline IFloat twice_pi() const
 {
     const Float negated_lower = (Float)(-2 * acos_rd(-1));

     return IFloat(-negated_lower, 2 * acos_ru(-1));
 }
Esempio n. 12
0
 /* Interval built from acos_rd(-1) and acos_ru(-1).  The lower bound goes
  * through a negate / cast-to-Float / negate sequence, as in the original.
  */
 inline IFloat pi() const
 {
     const Float negated_lower = (Float)(-acos_rd(-1));

     return IFloat(-negated_lower, acos_ru(-1));
 }
Esempio n. 13
0
 /* Interval built from acos_rd(0) and acos_ru(0).  The lower bound goes
  * through a negate / cast-to-Float / negate sequence, as in the original.
  */
 inline IFloat half_pi() const
 {
     const Float negated_lower = (Float)(-acos_rd(0));

     return IFloat(-negated_lower, acos_ru(0));
 }
Esempio n. 14
0
/*
 * Split (extended) interval division a / b.
 *
 * Returns the quotient as a pair of intervals so that a divisor straddling
 * zero can be represented as two pieces:
 *
 *   - an empty operand, or b == IFloat::zero: both members are left
 *     default-constructed;
 *   - b compares >= 0: the single quotient interval is stored in .first and
 *     .second is left default-constructed;
 *   - b straddles zero and a compares >= 0: .first is the negative ray
 *     (-inf, a.a / b.a] and .second the positive ray [a.a / b.b, +inf);
 *   - b straddles zero and a straddles zero: the quotient set is the whole
 *     real line, returned as .first = (-inf, +inf) with .second left
 *     default-constructed.  (The previous version returned two rays here
 *     that excluded a neighbourhood of zero, e.g. (-inf,-1] and [1,+inf)
 *     for [-1,1] / [-1,1], which misses 0.5 / 1.)
 *
 * Operands that compare <= 0 are handled by recursing on their negations and
 * mirroring the resulting pair.
 *
 * Lower bounds are computed as -(x / -y): the negated quotient is formed
 * first and its sign flipped afterwards ("for rounding" in the original).
 */
std::pair<IFloat, IFloat> splitdiv(const IFloat &a, const IFloat &b)
{
    std::pair<IFloat, IFloat> r;

    if (a.isempty or b.isempty)
        return r;

    IFloat tf;

    // Exactly one operand is <= 0: divide the negated problem, then negate
    // and swap the two pieces.
    if (a <= 0 and !(b <= 0))
    {
        r = splitdiv(-a, b);
        tf       =  r.first;
        r.first  = -r.second;
        r.second = -tf;
        return r;
    }
    
    if (!(a <= 0) and b <= 0)
    {
        r = splitdiv(a, -b);
        tf       =  r.first;
        r.first  = -r.second;
        r.second = -tf;
        return r;
    }

    // Both operands are <= 0: the signs cancel.
    if (a <= 0 and b <= 0)
        return splitdiv(-a, -b);

    if (b == IFloat::zero)
        return r;

    ProtectRounding pr;

    Float t; // Negated lower bound, for rounding

    if (b >= 0)
    {
        if (a >= 0)
        {
            t = a.a / -b.b; // For rounding
            r.first = IFloat(-t, a.b / b.a);
            return r;
        }

        t = a.a / -b.a; // For rounding
        r.first = IFloat(-t, a.b / b.a);
        return r;
    }

    // From here on the divisor straddles zero.
    Float inf = std::numeric_limits<Float>::infinity();

    if (a >= 0)
    {
        r.first = IFloat(-inf, a.a / b.a);

        t = a.a / -b.b; // For rounding
        r.second = IFloat(-t, inf);

        return r;
    }

    // Numerator and divisor both straddle zero: every real number is a
    // possible quotient, so there is no gap to split around.
    r.first = IFloat(-inf, inf);

    return r;
}
Esempio n. 15
0
        {
            return IFloat(-(Float)(-2 * acos_rd(-1)), 2 * acos_ru(-1));
        }

        /* Interval built from exp_rd(1) and exp_ru(1).  The lower bound goes
         * through a negate / cast-to-Float / negate sequence, as in the
         * original.
         */
        inline IFloat e() const
        {
            const Float negated_lower = (Float)(-exp_rd(1));

            return IFloat(-negated_lower, exp_ru(1));
        }
};
// File-local helper object; its member functions supply the values of the
// IFloat mathematical constants defined further down.
static setupConstants constants;

// Initial values of the FloatingStatus static members.  stdrounding records
// whatever fegetround() reports at the time this initializer runs.
bool FloatingStatus::prot        = true;
bool FloatingStatus::first_init  = true;
int  FloatingStatus::stdrounding = fegetround();

// Basic interval constants: the default-constructed (empty) interval and the
// intervals converted from the integers 0 and 1.
const IFloat IFloat::empty = IFloat();
const IFloat IFloat::zero  = 0;
const IFloat IFloat::one   = 1;

// Half-lines and the whole line: [0, +inf], [-inf, -0.] and [-inf, +inf].
const IFloat IFloat::pos = IFloat(0, std::numeric_limits<Float>::infinity());
const IFloat IFloat::neg = IFloat(-std::numeric_limits<Float>::infinity(), -0.);
const IFloat IFloat::all = IFloat(-std::numeric_limits<Float>::infinity(),
                                   std::numeric_limits<Float>::infinity());

// Mathematical constants, obtained from the `constants` helper above.
const IFloat IFloat::half_pi  = constants.half_pi();
const IFloat IFloat::pi       = constants.pi();
const IFloat IFloat::twice_pi = constants.twice_pi();

const IFloat IFloat::e = constants.e();

FloatingStatus::FloatingStatus()
Esempio n. 16
0
//------------------------------------------------------------------
// InvEigCg :
// Conjugate-gradient solve of MatPcDagMatPc * sol = src, with an
// optional deflated initial guess / deflated restarts and an optional
// eigCG accumulation of approximate eigenvectors.
//
//   sol         : output solution; zeroed on entry, then (if def_len>0)
//                 set to the deflated guess U*invH*U^dag*src.
//   src         : source vector.
//   true_res    : output, |src - MatPcDagMatPc*sol| / |src| recomputed
//                 from scratch after the iteration.
//   nev, m      : eigCG parameters; the eigCG part runs only when nev>0,
//                 in which case m must be larger than 2*nev.
//   V           : eigCG vectors (m of them); on return only the first
//                 2*nev are meaningful eigenvector approximations.
//   vec_len     : length of each V vector; must equal the checkerboard
//                 field size computed below.
//   M           : eigenvalues (length 2*nev). When deflating later, only
//                 those with small eigenvalues should be added to U.
//   U           : deflation-space vectors (def_len of them, stored as float).
//   invH        : inverse of H = U^dag*A*U, def_len x def_len, read row by row.
//   restart     : relative-residual thresholds; each time the residual
//                 drops below the next threshold (and def_len>0) the CG is
//                 restarted from a deflated correction.
//   restart_len : number of entries in restart (0 means never restart).
//
// Returns the number of CG iterations performed.
//
// Fixes relative to the previous version: the granularity error message
// now receives its arguments in the order the format string expects, the
// eigCG work arrays (T, Tt, Y, Q, QZ, H, EIG, INDEX) are released before
// returning, and the allocation labels of mmp_prev and Tt are corrected.
//------------------------------------------------------------------
int DiracOp::InvEigCg(Vector *sol, Vector *src, Float *true_res, const int nev, const int m, Vector **V, const int vec_len, Float *M, float **U, Rcomplex *invH, const int def_len, const Float *restart, const int restart_len)
{
	
	char *fname = "InvEigCg(V*,V*,F,F*)";
	VRB.Func(cname,fname);

	if(nev>0 && m<=2*nev)ERR.General(cname,fname,"m should larger than 2*nev\n");

	int f_size_cb;     // Node checkerboard size of the fermion field
	// Set the node checkerboard size of the fermion field
	//------------------------------------------------------------------
	if(lat.Fclass() == F_CLASS_CLOVER) {
		f_size_cb = GJP.VolNodeSites() * lat.FsiteSize() / 2;
	} else {
		f_size_cb = GJP.VolNodeSites() * lat.FsiteSize() / (lat.FchkbEvl()+1);
	}

	if(vec_len!=f_size_cb)ERR.General(cname,fname,"vector length V does not match the length of solution and src vectors!\n");


	int iter=0; //Current number of CG iterations
	int max_iter=dirac_arg->max_num_iter; //max iteration number

	// The field length comes first, the granularity second, matching the
	// order of the two %d conversions in the message.
	if (f_size_cb % GRAN != 0)ERR.General(cname,fname,"Field length %d is not a multiple of granularity %d\n", f_size_cb, GRAN);

	//calculate source norm
	Float src_norm_sq = src->NormSqNode(f_size_cb);
	DiracOpGlbSum(&src_norm_sq);

	VRB.Flow(cname,fname, "nev = %d, m= %d\n", nev, m);
	VRB.Flow(cname,fname, "Deflation length = %d \n", def_len);
	for(int i=0;i<restart_len;i++)VRB.Flow(cname,fname, "restart condition = %e\n", restart[i]);

	// Stop when |res|^2 <= |src|^2 * stop_rsd^2
	Float stp_cnd = src_norm_sq * dirac_arg->stop_rsd * dirac_arg->stop_rsd;
	VRB.Flow(cname,fname, "stp_cnd =%e\n", IFloat(stp_cnd));

	// Allocate memory for the solution/residual field.
	// X interleaves GRAN-sized chunks of the solution and of the residual.
	//------------------------------------------------------------------
	IFloat *X = (IFloat *) fmalloc(cname,fname,"X",2*f_size_cb * sizeof(Float));

	// Allocate memory for the direction vector dir.
	//------------------------------------------------------------------
	Vector *dir;
	if(GJP.VolNodeSites() >4096) 
		dir = (Vector *) smalloc(cname,fname,"dir",f_size_cb * sizeof(Float));
	else dir = (Vector *) fmalloc(cname,fname,"dir",f_size_cb * sizeof(Float));

	// Allocate mem. for the result vector of matrix multiplication mmp.
	//------------------------------------------------------------------
	Vector *mmp;
	if(GJP.VolNodeSites() >4096) 
		mmp = (Vector *) smalloc(cname,fname,"mmp",f_size_cb * sizeof(Float));
	else mmp = (Vector *) fmalloc(cname,fname,"mmp",f_size_cb * sizeof(Float));

	Vector *mmp_prev=NULL;
	//eigCG part
	if(nev>0)
	{
		mmp_prev = (Vector *) smalloc(cname,fname,"mmp_prev",f_size_cb * sizeof(Float));
	}
	//eigCG end

	Rcomplex *Ub=NULL,*invHUb=NULL;
	if(def_len>0)
	{
		Ub=(Rcomplex *) fmalloc(cname,fname,"Ub",2*def_len*sizeof(Float));
		invHUb=(Rcomplex *) fmalloc(cname,fname,"invHUb",2*def_len*sizeof(Float));
	}


	//initial solution guess
	sol->VecZero(f_size_cb);
	if(def_len>0)
	{
		//sol = U*invH*U^dag*src;
		for(int ii=0;ii<def_len;ii++)
		{
			//in CPS, dot(a,b)=a^* * b
			//Ub[ii]=U[ii]->CompDotProductGlbSum(src,f_size_cb);
			//NOTICE!!! Should be improved to do a single glb_sum for all Ub
			//use function CompDotProductNode, after loop, do glb_sum(Ub,2*def_len)
			Float c_r, c_i;
			compDotProduct<float, Float>(&c_r, &c_i, U[ii], (Float *)src,f_size_cb);
			glb_sum_five(&c_r);
			glb_sum_five(&c_i);
			Ub[ii]=Complex(c_r,c_i);
		}
		// invHUb = invH * Ub, with invH traversed row by row
		int index=0;
		for(int ii=0;ii<def_len;ii++)
		{
			invHUb[ii]=0.0;
			for(int jj=0;jj<def_len;jj++)
				invHUb[ii]+=invH[index++]*Ub[jj];
		}
		for(int ii=0;ii<def_len;ii++)
		{
			//sol->CTimesV1PlusV2(invHUb[ii],U[ii],sol,f_size_cb);
			cTimesV1PlusV2<Float,float,Float>((Float *)sol, real(invHUb[ii]), imag(invHUb[ii]), U[ii],(Float *)sol,f_size_cb);

		}
#ifdef TEST
		for(int ii=0;ii<def_len;ii++)
		{
			Float xx=U[ii]->NormSqNode(f_size_cb);
			DiracOpGlbSum(&xx);
			std::cout<<"U[ii] norm = "<<xx<<std::endl;
		}
		std::cout<<"Ub vector"<<std::endl;
		for(int i=0;i<def_len;i++)
			std::cout<<Ub[i].real()<<'\t';
		std::cout<<std::endl;
		std::cout<<"inv(H) matrix:"<<std::endl;
		for(int i=0;i<def_len;i++)
		{
			for(int j=0;j<def_len;j++)
				std::cout<<invH[i*def_len+j].real()<<'\t';
			std::cout<<std::endl;
		}
		Float xx=sol->NormSqNode(f_size_cb);
		DiracOpGlbSum(&xx);
		std::cout<<"inital guess norm = "<<xx<<std::endl;
#endif
	}

	//dir = res(part of X vector) = src - MatPcDagMatPc * sol
	MatPcDagMatPc(mmp, sol);
	dir->CopyVec(src, f_size_cb);
	dir->VecMinusEquVec(mmp, f_size_cb);

	//aux pointers
	IFloat *Fsol = (IFloat*)sol;
	IFloat *Fdir = (IFloat*)dir;
	IFloat *Fmmp = (IFloat*)mmp;

	// Interleave solution and residual
	IFloat *Xptr = X;
	for (int j=0; j<f_size_cb/GRAN;j++) {
		for (int i=0; i<GRAN; i++) *Xptr++ = *(Fsol+j*GRAN+i); //initial solution
		for (int i=0; i<GRAN; i++) *Xptr++ = *(Fdir+j*GRAN+i); //residual
	}

	Float res_norm_sq_prv,res_norm_sq_cur;
	Float alpha,beta,pAp;

	res_norm_sq_cur = dir->NormSqNode(f_size_cb);
	DiracOpGlbSum(&res_norm_sq_cur);
	VRB.Flow(cname,fname, "|res[initial]|^2 = %e\n", IFloat(res_norm_sq_cur));

	sync();

	//eigCG part 
	int i_eig=0;
	alpha=1.0; //avoid 0/0 at the first eig iteration 
	beta=0.0;
	// alpha_old / beta_old are first read when i_eig != 0, i.e. only after
	// the first iteration has assigned them.
	Float alpha_old;
	Float beta_old;
	Float *T=NULL;
	Float *Tt=NULL;
	Float *Q=NULL;
	Float *QZ=NULL;
	Float *H=NULL;
	Float *Y=NULL;
	Float *EIG=NULL;
	int *INDEX=NULL;
	if(nev>0)
	{
		T=(Float *) fmalloc(cname,fname,"T", m*(m+1)/2*sizeof(Float));// mxm real symmetric matrix, lower triangular only
		Tt=(Float *) fmalloc(cname,fname,"Tt", m*(m+1)/2*sizeof(Float));// (m)x(m) real symmetric matrix, lower triangular only
		//T(i,j) = T[ i(i+1)/2+j]
		//NOTICE!! T is actually very sparse matrix, so it can be speed up by a very large factor
		for(int i=0;i<m*(m+1)/2;i++)T[i]=0.0;
		Y=(Float *) fmalloc(cname,fname,"Y", m*m*sizeof(Float));//m*m real matrix, to store eigen vectors when needed
		Q=(Float *) fmalloc(cname,fname,"Q", m*2*nev*sizeof(Float));//m*(2nev) real matrix
		QZ=(Float *) fmalloc(cname,fname,"QZ", m*2*nev*sizeof(Float));//m*(2nev) real matrix
		H=(Float *) fmalloc(cname,fname,"H", 2*nev*(2*nev+1)/2*sizeof(Float));//(2nev)*(2nev) real matrix, symmetric, lower part
		EIG=(Float *)fmalloc(cname,fname,"EIG",m*sizeof(Float));

		INDEX=(int *)fmalloc(cname,fname,"INDEX",nev*sizeof(int));//this vector is not necessary if we have a good eig solver for nev low
	}
	//
	//eigCG part end

	int restarted=0;
	// Absolute thresholds on |res|^2 derived from the relative ones in restart[].
	// Only dereferenced when restarted<restart_len, i.e. when it was allocated.
	Float *restartcond=NULL;
	if(restart_len>0)
	{
		restartcond=(Float *)fmalloc(cname,fname,"restartcond",restart_len*sizeof(Float));
		for(int i=0;i<restart_len;i++)restartcond[i]=restart[i]*restart[i]*src_norm_sq;
	}
	Float eigTotal=0.0;
	Float eigProj=0.0;
	Float defTime=0.0;
	Float total_time=0.0;
	total_time-=dclock();
	Float linalg_flops = 0;
	Float eigProj_flops = 0;
	Float linalg_time=0;
	CGflops=0;
	for(iter=0;iter<max_iter;iter++)
	{
		//eigCG part
		if(nev>0 && i_eig==m)mmp_prev->CopyVec(mmp,f_size_cb);
		//eig CG end
		
		res_norm_sq_prv = res_norm_sq_cur;
		MatPcDagMatPc(mmp,dir,&pAp);
		DiracOpGlbSum(&pAp);
		if(pAp==0)break;

		if(nev>0)
		{
			eigTotal-=dclock();
			int nev2=2*nev;
			//T(i_eig-1,i_eig-1)
			if(i_eig!=0)T[(i_eig-1)*i_eig/2+i_eig-1]=1.0/alpha+beta_old/alpha_old;
			if(i_eig==m)
			{
				//Yb need lowest nev eigen vector of T(m-1)
				//Y need lowest nev eigen vector of T(m)
#ifdef TEST
				std::cout<<" T matrix: "<<std::endl;
				for(int i=0;i<m;i++)
				{
					for(int j=0;j<m;j++)
					{
						if(i>=j)std::cout<<T[i*(i+1)/2+j]<<'\t';
						else std::cout<<T[j*(j+1)/2+i]<<'\t';
					}
					std::cout<<std::endl;
				}
#endif
				for(int i=0;i<m*(m-1)/2;i++)Tt[i]=T[i];
				eigen_solver(Tt,Y,EIG,m-1);//NOTICE, this is NOT needed, only need to calculate the lowest nev, not all m >2*nev
				//NOTICE, Y transpose is the eigen vectors.
				min_eig_index(INDEX,nev,EIG,m-1);
				//Q(nev:2nev-1)=Yb; with Yb last row zero
				for(int i=nev;i<2*nev;i++)
				{
					//Y transpose is eigen vector
					//for(int j=0;j<m-1;j++)Q[j*2*nev+i]=Y[j+INDEX[i-nev]*(m-1)];
					//Y is eigen vector
					for(int j=0;j<m-1;j++)Q[j*2*nev+i]=Y[j*(m-1)+INDEX[i-nev]];
					Q[(m-1)*2*nev+i]=0.0;
				}
				for(int i=0;i<m*(m+1)/2;i++)Tt[i]=T[i];
				eigen_solver(Tt,Y,EIG,m);//NOTICE, this is NOT needed, only need to calculate the lowest nev, not all m >2*nev
				min_eig_index(INDEX,nev,EIG,m);
				//Q(0:nev-1)=Yb; 
				for(int i=0;i<nev;i++)
				{
					//for(int j=0;j<m;j++)Q[j*2*nev+i]=Y[j+INDEX[i]*m];
					//Y is eigen vector
					for(int j=0;j<m;j++)Q[j*2*nev+i]=Y[j*m+INDEX[i]];
				}
				
				//Q=orth([Y,Yb]); with YB last row zero
				//rank(Q) may be smaller than 2*nev. remove these
				//should be optimized. maybe save row first for Q
				int rank=nev;
				for(int i=nev;i<2*nev;i++)
				{
					// Project column i against the columns already accepted
					for(int j=0;j<rank;j++)
					{
						Float xy=0.0;
						for(int k=0;k<m;k++)xy+=Q[k*2*nev+i]*Q[k*2*nev+j];
						for(int k=0;k<m;k++)Q[k*2*nev+i]-=xy*Q[k*2*nev+j];
					}
					//normalize; columns with (almost) no remaining norm are dropped
					Float xx=0.0;
					for(int k=0;k<m;k++)xx+=Q[k*2*nev+i]*Q[k*2*nev+i];
					if(xx>1e-16)
					{
						xx=1.0/sqrt(xx);
						for(int k=0;k<m;k++)Q[k*2*nev+rank]=xx*Q[k*2*nev+i];
						rank++;
					}
				}
				VRB.Flow(cname,fname,"Rank of Q = %d\n",rank);
				//H=Q' * T * Q
				for(int i=0;i<rank;i++)
					for(int j=0;j<=i;j++)
					{
						H[i*(i+1)/2+j]=0.0;
						for(int l=0;l<m;l++)
							for(int k=0;k<m;k++)
							{
								if(k<=l)H[i*(i+1)/2+j]+=Q[l*nev2+i]*T[l*(l+1)/2+k]*Q[k*nev2+j];
								else H[i*(i+1)/2+j]+=Q[l*nev2+i]*T[k*(k+1)/2+l]*Q[k*nev2+j];
							}
					}

#ifdef TEST
				std::cout<<"H matrix:"<<std::endl;
				for(int i=0;i<rank;i++)
				{
					for(int j=0;j<rank;j++)
					{
						if(i>=j)std::cout<<H[i*(i+1)/2+j]<<'\t';
						else std::cout<<H[j*(j+1)/2+i]<<'\t';
					}
					std::cout<<std::endl;
				}
#endif
				//[Z,M]=eig(H)
				eigen_solver(H,Y,M,rank);
				for(int i=rank;i<2*nev;i++)M[i]=0.0;//set M[i>=rank] to zero
				//Notice, Y transpose is eigenvectors.
				for(int i=0;i<rank;i++)VRB.Flow(cname,fname,"eig %d : %e \n",i,M[i]);
				
				//V=V*(Q*Z)
				//1.QZ=Q*Z
				//transpose QZ here to speed up the later calculation
				for(int j=0;j<rank;j++)
				for(int i=0;i<m;i++)
				{
					QZ[i+m*j]=0.0;
					//for(int k=0;k<rank;k++)QZ[i+m*j]+=Q[i*nev2+k]*Y[j*rank+k];
					for(int k=0;k<rank;k++)QZ[i+m*j]+=Q[i*nev2+k]*Y[j+k*rank];
				}

#ifdef TEST
				std::cout<<"QZ matrix:"<<std::endl;
				for(int i=0;i<m;i++)
				{
					for(int j=0;j<rank;j++)
					{
						std::cout<<QZ[i+j*m]<<'\t';
					}
					std::cout<<std::endl;
				}
#endif
				eigProj-=dclock();
				//2.V=V*QZ need to be implement very efficiently
				//. The way we save QZ is transposed to column first 
				eigcg_vec_mult(V,m,QZ,rank,f_size_cb);
				eigProj+=dclock();
				eigProj_flops += 2*f_size_cb*m*rank;

				//T=M
				for(int i=0;i<m*(m+1)/2;i++)T[i]=0.0;
				for(int i=0;i<rank;i++)T[i*(i+1)/2+i]=M[i];
				i_eig=rank;
				//w=mmp-beta*mmp_prev
				mmp_prev->FTimesV1PlusV2(-beta,mmp_prev,mmp,f_size_cb);
				//T(i_eig+1,1:i_eig)=w^dag * V/sqrt(rsq) 
				//T is symmetric and REAL !! TESTED
				for(int ii=0;ii<i_eig;ii++)
				{
					//T(i_eig,ii)
					T[i_eig*(i_eig+1)/2+ii]=mmp_prev->ReDotProductGlbSum(V[ii],f_size_cb); //again these global sums can be done at once to speed up
					T[i_eig*(i_eig+1)/2+ii]/=sqrt(res_norm_sq_cur);
				}
			}
			else
			{
				if(i_eig!=0)
				{
					//T(i_eig,i_eig-1)
					T[i_eig*(i_eig+1)/2+i_eig-1]=-sqrt(beta)/alpha;
				}
			}
			//V[i_eig]=r/sqrt(rsq);
			Float *vptr = (Float *)V[i_eig];
			invcg_copy_rnorm(vptr, res_norm_sq_cur, X+GRAN, f_size_cb/GRAN);
			i_eig++;
			eigTotal+=dclock();
		}
		
		alpha_old=alpha; //eigCG part
		alpha = -res_norm_sq_prv/pAp;
		// res = - alpha * (MatPcDagMatPc * dir) + res;
		// res_norm_sq_cur = res * res

		//test
		//Float test=((Vector *)X)->NormSqGlbSum(f_size_cb*2);
		//VRB.Flow(cname,fname,"X norm=%e \n",test);
		//test=mmp->NormSqGlbSum(f_size_cb);
		//VRB.Flow(cname,fname,"mmp norm=%e \n",test);

		linalg_time-=dclock();
		invcg_r_norm(X+GRAN, &alpha, Fmmp, X+GRAN, f_size_cb/GRAN, &res_norm_sq_cur);
		linalg_time+=dclock();
		DiracOpGlbSum(&res_norm_sq_cur);
		linalg_flops+=f_size_cb*4;
		alpha = -alpha;
		beta_old=beta; //eigCG part
		beta = res_norm_sq_cur / res_norm_sq_prv;
		//VRB.Flow(cname,fname,"a=%e, b=%e, pAp=%e \n",alpha,beta,pAp);
		// sol = alpha * dir + sol;
		// dir = beta * dir + res;
		linalg_time-=dclock();
		invcg_xp_update(X, Fdir, &alpha, &beta, Fdir, X, f_size_cb/GRAN);
		linalg_time+=dclock();
		linalg_flops+=f_size_cb*4;
		
		//consider restarting the init-CG (at most once per threshold)
		if(restarted<restart_len && def_len>0 && res_norm_sq_cur<restartcond[restarted])
		{
			defTime-=dclock();
			restarted++;
			VRB.Flow(cname,fname,"eigCG restarted at res_norm_sq_cur= %e\n",res_norm_sq_cur);
			//reuse dir vector as the initial guess vector of A*e=r ,with dir=e=U*invH*U^dag*r
			//reuse mmp for r in X
			//use sol for x in X
			Xptr = X;
			for (int j=0; j<f_size_cb/GRAN; j++) {
				for (int i=0; i<GRAN; i++) *(Fsol+j*GRAN+i)=*Xptr++; // solution
				for (int i=0; i<GRAN; i++) *(Fmmp+j*GRAN+i)=*Xptr++; // residual
			}

			for(int ii=0;ii<def_len;ii++)
			{
				//Ub[ii]=U[ii]->CompDotProductGlbSum(mmp,f_size_cb);
				//NOTICE!!! Should be improved to a single glb_sum for all Ub
				Float c_r, c_i;
				compDotProduct<float, Float>(&c_r, &c_i, U[ii], (Float *)mmp,f_size_cb);
				glb_sum_five(&c_r);
				glb_sum_five(&c_i);
				Ub[ii]=Complex(c_r,c_i);
			}
			int index=0;
			for(int ii=0;ii<def_len;ii++)
			{
				invHUb[ii]=0.0;
				for(int jj=0;jj<def_len;jj++)
					invHUb[ii]+=invH[index++]*Ub[jj];
			}
			dir->VecZero(f_size_cb);
			for(int ii=0;ii<def_len;ii++)
			{
				//dir->CTimesV1PlusV2(invHUb[ii],U[ii],dir,f_size_cb);
				cTimesV1PlusV2<Float,float,Float>((Float *)dir, real(invHUb[ii]), imag(invHUb[ii]), U[ii],(Float *)dir,f_size_cb);
			}

			sol->VecAddEquVec(dir,f_size_cb); //get new sol
			//reuse the first half of X to save M^dag*M*e
			Vector *PAe=(Vector *)X;

			MatPcDagMatPc(PAe,dir);
			mmp->VecMinusEquVec(PAe,f_size_cb); //get new res

			res_norm_sq_cur = mmp->NormSqNode(f_size_cb);
			DiracOpGlbSum(&res_norm_sq_cur);

			dir->CopyVec(mmp,f_size_cb);

			Xptr = X;
			for (int j=0; j<f_size_cb/GRAN;j++) {
				for (int i=0; i<GRAN; i++) *Xptr++ = *(Fsol+j*GRAN+i); //new initial solution
				for (int i=0; i<GRAN; i++) *Xptr++ = *(Fmmp+j*GRAN+i); //new residual
			}

			defTime+=dclock();
		}
		
		VRB.Flow(cname,fname, "|res[%d]|^2 = %e\n", iter, IFloat(res_norm_sq_cur));
		if(res_norm_sq_cur <= stp_cnd) break;
	}
	total_time+=dclock();
	
	VRB.Result(cname,fname,"1. Time on CG : %e  seconds in %e flops\n", total_time-eigTotal-defTime,((Float)CGflops+linalg_flops)/(total_time-eigTotal-defTime));
	VRB.Result(cname,fname,"1.x CG linear algebra : %e flops / %e seconds = %e flops\n", linalg_flops, linalg_time, linalg_flops/linalg_time);
	VRB.Result(cname,fname,"2. Total Time on eig part: %e seconds \n", eigTotal);
	if(nev>0)VRB.Result(cname,fname,"2.x projection part of eig part : %e flops / %e seconds = %e flops\n", eigProj_flops, eigProj, eigProj_flops/eigProj);
	if(def_len>0)VRB.Result(cname,fname,"3. deflation(restart) time : %e seconds\n", defTime);


	if(iter == max_iter)
		VRB.Warn(cname,fname, "CG reached max iterations = %d. |res|^2 = %e\n", iter+1, IFloat(res_norm_sq_cur) );

	// De-interleave: copy the solution chunks of X back into sol,
	// skipping the residual chunk that follows each solution chunk.
	Xptr = X-GRAN;
	for (int j=0; j<f_size_cb; j++) {
		if (j%GRAN==0) Xptr += GRAN;
		*(Fsol++) = *(Xptr++);
	}

	// Recompute the true residual from the final solution.
	MatPcDagMatPc(mmp, sol);
	dir->CopyVec(src, f_size_cb);
	dir->VecMinusEquVec(mmp, f_size_cb);
	res_norm_sq_cur = dir->NormSqNode(f_size_cb);
	DiracOpGlbSum(&res_norm_sq_cur);
	*true_res = res_norm_sq_cur/src_norm_sq;
	*true_res = sqrt(*true_res);
	VRB.Result(cname,fname, "True |res| / |src| = %e, iter = %d\n", IFloat(*true_res), iter);

	// Free memory
	sfree(cname,fname, "mmp", mmp);
	sfree(cname,fname, "dir", dir);
	sfree(cname,fname, "X", X);
	if(def_len>0)
	{
		sfree(cname,fname, "Ub", Ub);
		sfree(cname,fname, "invHUb", invHUb);
	}
	//eigCG part: release the previous-mmp vector and all eigCG work arrays
	if(nev>0)
	{
		sfree(cname,fname,"mmp_prev",mmp_prev);
		sfree(cname,fname,"T",T);
		sfree(cname,fname,"Tt",Tt);
		sfree(cname,fname,"Y",Y);
		sfree(cname,fname,"Q",Q);
		sfree(cname,fname,"QZ",QZ);
		sfree(cname,fname,"H",H);
		sfree(cname,fname,"EIG",EIG);
		sfree(cname,fname,"INDEX",INDEX);
	}
	//eigCG part end
	if(restart_len>0)sfree(cname,fname,"restartcond",restartcond);

	sync();

	return iter;
}