示例#1
0
/* Computes the staple :
                 mu
              +-------+
        nu    |       |
              |       |
              X       X
  Where the mu link can be any su3_matrix. The result is saved in staple.
  if staple==NULL then the result is not saved.
  It also adds the computed staple to the fatlink[mu] with weight coef.
*/
static void
compute_gen_staple(QDP_ColorMatrix *staple, int mu, int nu,
		   QDP_ColorMatrix *link, double dcoef,
		   QDP_ColorMatrix *gauge[], QDP_ColorMatrix *fl[])
{
  QLA_Real coef = dcoef;
  QDP_ColorMatrix *ts0, *ts1;
  QDP_ColorMatrix *tmat1, *tmat2;
  QDP_ColorMatrix *tempmat;

  ts0 = QDP_create_M();
  ts1 = QDP_create_M();
  tmat1 = QDP_create_M();
  tmat2 = QDP_create_M();
  tempmat = QDP_create_M();

  /* Upper staple */
  QDP_M_eq_sM(ts0, link, QDP_neighbor[nu], QDP_forward, QDP_all);
  QDP_M_eq_sM(ts1, gauge[nu], QDP_neighbor[mu], QDP_forward, QDP_all);

  if(staple!=NULL) {  /* Save the staple */
    QDP_M_eq_M_times_Ma(tmat1, ts0, ts1, QDP_all);
    QDP_M_eq_M_times_M(staple, gauge[nu], tmat1, QDP_all);
  } else {  /* No need to save the staple. Add it to the fatlinks */
    QDP_M_eq_M_times_Ma(tmat1, ts0, ts1, QDP_all);
    QDP_M_eq_M_times_M(tmat2, gauge[nu], tmat1, QDP_all);
    QDP_M_peq_r_times_M(fl[mu], &coef, tmat2, QDP_all);
  }

  /* lower staple */
  QDP_M_eq_sM(ts0, gauge[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
  QDP_M_eq_Ma_times_M(tmat1, gauge[nu], link, QDP_all);
  QDP_M_eq_M_times_M(tempmat, tmat1, ts0, QDP_all);
  QDP_M_eq_sM(ts0, tempmat, QDP_neighbor[nu], QDP_backward, QDP_all);

  if(staple!=NULL) { /* Save the staple */
    QDP_M_peq_M(staple, ts0, QDP_all);
    QDP_M_peq_r_times_M(fl[mu], &coef, staple, QDP_all);
  } else {  /* No need to save the staple. Add it to the fatlinks */
    QDP_M_peq_r_times_M(fl[mu], &coef, ts0, QDP_all);
  }

  QDP_destroy_M(ts0);
  QDP_destroy_M(ts1);
  QDP_destroy_M(tmat1);
  QDP_destroy_M(tmat2);
  QDP_destroy_M(tempmat);
} /* compute_gen_staple */
void 
QOP_hisq_force_multi_fnmat2_qdp(QOP_info_t *info,  
				QOP_FermionLinksHisq *flh,
				QDP_ColorMatrix *force[], 
				QOP_hisq_coeffs_t *hisq_coeff,
				REAL *residues,
				QDP_ColorVector *x[], 
				int *n_orders_naik)
{
#define NC QDP_get_nc(force[0])
  double dtime = QOP_time();

  QDP_ColorMatrix *deriv[4];
  for(int mu=0; mu<4; mu++) {
    deriv[mu] = QDP_create_M();
    QDP_M_eq_zero(deriv[mu], QDP_all);
  }
  QOP_hisq_deriv_multi_fnmat2_qdp(info, flh, deriv, hisq_coeff, residues, x, n_orders_naik);

  // contraction with the link in question should be done here,
  // after contributions from all levels of smearing are taken into account
  // Put antihermitian traceless part into momentum 
  // add force to momentum
  QDP_ColorMatrix *mtmp = QDP_create_M();
  for(int dir=0; dir<4; dir++) {
    QDP_M_eq_M_times_Ma(mtmp, flh->U_links[dir], deriv[dir], QDP_all);
    QDP_M_eq_antiherm_M(deriv[dir], mtmp, QDP_all);
    QDP_M_peq_M(force[dir], deriv[dir], QDP_all);
  }
  info->final_flop += (4.*(198+24+18))*QDP_sites_on_node; 

  QDP_destroy_M(mtmp);
  for(int mu=0; mu<4; mu++) {
    QDP_destroy_M(deriv[mu]);
  }

  info->final_sec = QOP_time() - dtime;
  //QOP_printf0("HISQ force flops = %g\n", info->final_flop);
#undef NC
}
示例#3
0
// topdir = 1..nd
// sidedir = -nd..nd
// toplinknum,sidelinknum = 0..nin-1
void
QOP_staples(QOP_info_t *info, int nout, int nin,
	    QDP_ColorMatrix *out[], QDP_ColorMatrix *in[],
	    int nstaples[], int *topdir[], int *sidedir[],
	    int *toplinknum[], int *sidelinknum[], QLA_Real *coef[])
{
#define NC QDP_get_nc(in[0])
  double dtime = QOP_time();
  double nflops = 0;
  int nd = QDP_ndim();
  QDP_ColorMatrix *ftmps[nin][nd], *t1, *t2, *bt2[nd];
  for(int i=0; i<nin; i++)
    for(int j=0; j<nd; j++)
      ftmps[i][j] = NULL;
  for(int i=0; i<nd; i++) bt2[i] = NULL;
  t1 = QDP_create_M();
  t2 = QDP_create_M();

  for(int io=0; io<nout; io++) {
    //QOP_printf0("%i: ns: %i\n", io, nstaples[io]);
    for(int s=0; s<nstaples[io]; s++) {
      QLA_Real c = coef[io][s];
      int tn = toplinknum[io][s];
      int sdir = sidedir[io][s];
      //QOP_printf0(" %i:  sdir: %i  c: %g\n", s, sdir, c);
      if(sdir==0) {
	if(c==1) {
	  QDP_M_peq_M(out[io], in[tn], QDP_all);
	  nflops += PEQM;
	} else {
	  QDP_M_peq_r_times_M(out[io], &c, in[tn], QDP_all);
	  nflops += 2*PEQM;
	}
      } else if(sdir>0) {
	int nu = sdir-1;
	int mu = topdir[io][s]-1;
	int sn = sidelinknum[io][s];
	QDP_ColorMatrix *Umunu = getU(tn, mu, nu);
	QDP_ColorMatrix *Unumu = getU(sn, nu, mu);
	QDP_M_eq_M_times_M(t1, in[sn], Umunu, QDP_all);
	if(c==1) {
	  QDP_M_peq_M_times_Ma(out[io], t1, Unumu, QDP_all);
	  nflops += EQMTM+PEQMTM;
	} else {
	  QDP_M_eq_M_times_Ma(t2, t1, Unumu, QDP_all);
	  QDP_M_peq_r_times_M(out[io], &c, t2, QDP_all);
	  nflops += 2*EQMTM+2*PEQM;
	}
      } else {
	int nu = -sdir-1;
	int mu = topdir[io][s]-1;
	int sn = sidelinknum[io][s];
	QDP_ColorMatrix *Unumu = getU(sn, nu, mu);
	QDP_M_eq_M_times_M(t1, in[tn], Unumu, QDP_all);
	QDP_M_eq_Ma_times_M(t2, in[sn], t1, QDP_all);
	QDP_ColorMatrix *tb = shiftb(t2, nu);
	if(c==1) {
	  QDP_M_peq_M(out[io], tb, QDP_all);
	  nflops += 2*EQMTM+PEQM;
	} else {
	  QDP_M_peq_r_times_M(out[io], &c, tb, QDP_all);
	  nflops += 2*EQMTM+2*PEQM;
	}
	QDP_discard_M(tb);
      }
    }
  }

  for(int i=0; i<nin; i++)
    for(int j=0; j<nd; j++)
      if(ftmps[i][j]!=NULL) QDP_destroy_M(ftmps[i][j]);
  for(int i=0; i<nd; i++) if(bt2[i]!=NULL) QDP_destroy_M(bt2[i]);
  QDP_destroy_M(t1);
  QDP_destroy_M(t2);
  info->final_sec = QOP_time() - dtime;
  info->final_flop = nflops*QDP_sites_on_node; 
  info->status = QOP_SUCCESS;
#undef NC
}
示例#4
0
// topdir = 1..nd
// sidedir = -nd..nd
// toplinknum,sidelinknum = 0..nin-1
void
QOP_staples_deriv(QOP_info_t *info, int nout, int nin,
		  QDP_ColorMatrix *deriv[], QDP_ColorMatrix *chain[],
		  QDP_ColorMatrix *in[],
		  int nstaples[], int *topdir[], int *sidedir[],
		  int *toplinknum[], int *sidelinknum[], QLA_Real *coef[])
{
#define NC QDP_get_nc(in[0])
  double dtime = QOP_time();
  double nflops = 0;
  int nd = QDP_ndim();
  QDP_ColorMatrix *ftmps[nin][nd], *t1, *t2, *t3, *t4, *tc, *bt2[nd], *bt3[nd], *ctmps[nd];
  int ctn[nd];
  for(int i=0; i<nin; i++)
    for(int j=0; j<nd; j++)
      ftmps[i][j] = NULL;
  for(int i=0; i<nd; i++) bt2[i] = bt3[i] = ctmps[i] = NULL;
  t1 = QDP_create_M();
  t2 = QDP_create_M();
  t3 = QDP_create_M();
  t4 = QDP_create_M();
  tc = QDP_create_M();

  // process in reverse in case calculated staples used as input for others
  for(int io=nout-1; io>=0; io--) {
    for(int i=0; i<nd; i++) {
      if(ctmps[i]) QDP_discard_M(ctmps[i]);
      ctn[i] = 0;
    }
    QDP_M_eq_M(tc, chain[io], QDP_all);
    for(int s=0; s<nstaples[io]; s++) {
      QLA_Real c = coef[io][s];
      int tn = toplinknum[io][s];
      int sdir = sidedir[io][s];
      //QOP_printf0("io: %i  s: %i  sdir: %i  tn: %i  c: %g\n", io, s, sdir, tn, c);
      if(sdir==0) {
	if(c==1) {
	  QDP_M_peq_M(deriv[tn], tc, QDP_all);
	  nflops += PEQM;
	} else {
	  QDP_M_peq_r_times_M(deriv[tn], &c, tc, QDP_all);
	  nflops += 2*PEQM;
	}
      } else if(sdir>0) {
	int nu = sdir-1;
	int mu = topdir[io][s]-1;
	int sn = sidelinknum[io][s];
	//QOP_printf0("  mu: %i  nu: %i  sn: %i\n", mu, nu, sn);
	QDP_ColorMatrix *Umunu = getU(tn, mu, nu);
	QDP_ColorMatrix *Unumu = getU(sn, nu, mu);
	QDP_M_eq_M_times_M(t1, in[sn], Umunu, QDP_all);
	QDP_M_eq_Ma_times_M(t2, tc, t1, QDP_all);
	QDP_ColorMatrix *tb2 = shiftb(t2, mu);
	QDP_M_eq_M_times_M(t1, tc, Unumu, QDP_all);
	QDP_M_eq_Ma_times_M(t3, in[sn], t1, QDP_all);
	QDP_ColorMatrix *tb3 = shiftb(t3, nu);
	if(c==1) {
	  QDP_M_peq_M_times_Ma(deriv[sn], t1, Umunu, QDP_all);
	  QDP_M_peq_M(deriv[sn], tb2, QDP_all);
	  QDP_M_peq_M(deriv[tn], tb3, QDP_all);
	  nflops += 4*EQMTM+PEQMTM+2*PEQM;
	} else {
	  QDP_M_eq_M_times_Ma(t4, t1, Umunu, QDP_all);
	  QDP_M_peq_r_times_M(deriv[sn], &c, t4, QDP_all);
	  QDP_M_peq_r_times_M(deriv[sn], &c, tb2, QDP_all);
	  QDP_M_peq_r_times_M(deriv[tn], &c, tb3, QDP_all);
	  nflops += 5*EQMTM+6*PEQM;
	}
	QDP_discard_M(tb2);
	QDP_discard_M(tb3);
      } else {
	int nu = -sdir-1;
	int mu = topdir[io][s]-1;
	int sn = sidelinknum[io][s];
	QDP_ColorMatrix *Cmunu = getC(nu);
	QDP_ColorMatrix *Unumu = getU(sn, nu, mu);
	QDP_M_eq_M_times_M(t1, in[sn], Cmunu, QDP_all);
	QDP_M_eq_Ma_times_M(t2, in[tn], t1, QDP_all);
	QDP_ColorMatrix *tb2 = shiftb(t2, mu);
	QDP_M_eq_M_times_M(t3, in[tn], Unumu, QDP_all);
	if(c==1) {
	  QDP_M_peq_M_times_Ma(deriv[tn], t1, Unumu, QDP_all);
	  QDP_M_peq_M_times_Ma(deriv[sn], t3, Cmunu, QDP_all);
	  QDP_M_peq_M(deriv[sn], tb2, QDP_all);
	  nflops += 3*EQMTM+2*PEQMTM+PEQM;
	} else {
	  QDP_M_eq_M_times_Ma(t4, t1, Unumu, QDP_all);
	  QDP_M_peq_r_times_M(deriv[tn], &c, t4, QDP_all);
	  QDP_M_eq_M_times_Ma(t4, t3, Cmunu, QDP_all);
	  QDP_M_peq_r_times_M(deriv[sn], &c, t4, QDP_all);
	  QDP_M_peq_r_times_M(deriv[sn], &c, tb2, QDP_all);
	  nflops += 5*EQMTM+6*PEQM;
	}
	QDP_discard_M(tb2);
      }
    }
  }

  for(int i=0; i<nin; i++)
    for(int j=0; j<nd; j++)
      if(ftmps[i][j]!=NULL) QDP_destroy_M(ftmps[i][j]);
  for(int i=0; i<nd; i++) {
    if(bt2[i]!=NULL) QDP_destroy_M(bt2[i]);
    if(bt3[i]!=NULL) QDP_destroy_M(bt3[i]);
    if(ctmps[i]!=NULL) QDP_destroy_M(ctmps[i]);
  }
  QDP_destroy_M(t1);
  QDP_destroy_M(t2);
  QDP_destroy_M(t3);
  QDP_destroy_M(t4);
  QDP_destroy_M(tc);
  info->final_sec = QOP_time() - dtime;
  info->final_flop = nflops*QDP_sites_on_node; 
  info->status = QOP_SUCCESS;
#undef NC
}
void 
QOPPC(symanzik_1loop_gauge_force1) (QOP_info_t *info, QOP_GaugeField *gauge, 
		   QOP_Force *force, QOP_gauge_coeffs_t *coeffs, REAL eps)
{
  REAL Plaq, Rect, Pgm ;
  QDP_ColorMatrix *tempmom_qdp[4];
  QDP_ColorMatrix *Amu[6]; // products of 2 links Unu(x)*Umu(x+nu)
  QDP_ColorMatrix *tmpmat;
  QDP_ColorMatrix *tmpmat1;
  QDP_ColorMatrix *tmpmat2;
  QDP_ColorMatrix *staples;
  QDP_ColorMatrix *tmpmat3;
  QDP_ColorMatrix *tmpmat4;

  int i, k;
  int mu, nu, sig;
  double dtime;
  //REAL eb3 = -eps*beta/3.0;
  REAL eb3 = -eps/3.0;
  int j[3][2] = {{1,2},
                 {0,2},
                 {0,1}};
  
  //  QOP_printf0("beta: %e, eb3: %e\n", beta, eb3);
  dtime = -QOP_time();

  for(mu=0; mu<4; mu++) {
    tempmom_qdp[mu] = QDP_create_M();
    QDP_M_eq_zero(tempmom_qdp[mu], QDP_all);
  }

  tmpmat = QDP_create_M();
  for(i=0; i<QOP_common.ndim; i++) {
    fblink[i] = gauge->links[i];
    fblink[OPP_DIR(i)] = QDP_create_M();
    QDP_M_eq_sM(tmpmat, fblink[i], QDP_neighbor[i], QDP_backward, QDP_all);
    QDP_M_eq_Ma(fblink[OPP_DIR(i)], tmpmat, QDP_all);
  }
  

  for(i=0; i<6; i++) {
    Amu[i] = QDP_create_M();
  }

  staples = QDP_create_M();
  tmpmat1 = QDP_create_M();
  tmpmat2 = QDP_create_M();
  tmpmat3 = QDP_create_M();
  tmpmat4 = QDP_create_M();

  Plaq = coeffs->plaquette;
  Rect = coeffs->rectangle;
  Pgm  = coeffs->parallelogram;

  //Construct 3-staples and rectangles
  for(mu=0; mu<4; mu++) {
    i=0;
    for(nu=0; nu<4; nu++) {
      if(nu!=mu){
	// tmpmat1 = Umu(x+nu)
	QDP_M_eq_sM(tmpmat1, fblink[mu], QDP_neighbor[nu], QDP_forward, QDP_all); 
        QDP_M_eq_M_times_M(Amu[i], fblink[nu], tmpmat1, QDP_all);

        //tmpmat2 = Umu(x-nu)
	QDP_M_eq_sM(tmpmat2, fblink[mu], QDP_neighbor[nu], QDP_backward, QDP_all);
        QDP_M_eq_M_times_M(Amu[i+3], fblink[OPP_DIR(nu)], tmpmat2, QDP_all);
       

 
	//tmpmat = U_{nu}(x+mu)
        QDP_M_eq_sM(tmpmat, fblink[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_M_times_Ma(staples, Amu[i], tmpmat, QDP_all);        
        QDP_M_peq_r_times_M(tempmom_qdp[mu], &Plaq, staples, QDP_all);
 
        //tmpmat = U_{-nu}(x+mu)
        QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(nu)], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_Ma_times_M(tmpmat3, fblink[OPP_DIR(nu)], staples, QDP_all);
        QDP_M_eq_M_times_M(tmpmat4, tmpmat3, tmpmat, QDP_all);
        QDP_M_eq_sM(tmpmat, tmpmat4, QDP_neighbor[nu], QDP_forward, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[mu], &Rect, tmpmat, QDP_all);

        QDP_M_eq_Ma_times_M(tmpmat4, tmpmat2, tmpmat3, QDP_all);
        QDP_M_eq_sM(tmpmat, tmpmat4, QDP_neighbor[nu], QDP_forward, QDP_all);
        QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[mu], QDP_backward, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[nu], &Rect, tmpmat3, QDP_all);

        //tmpmat = U_{-nu}(x+mu)
        QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(nu)], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_M_times_Ma(tmpmat3, tmpmat2, tmpmat, QDP_all);
        QDP_M_eq_M_times_Ma(tmpmat, tmpmat3, staples, QDP_all);        
        QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[nu], QDP_forward, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[nu], &Rect, tmpmat3, QDP_all);




        //tmpmat = U_{-nu}(x+mu) 
        QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(nu)], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_M_times_Ma(staples, Amu[i+3], tmpmat, QDP_all);        
        QDP_M_peq_r_times_M(tempmom_qdp[mu], &Plaq, staples, QDP_all);

        QDP_M_eq_Ma_times_M(tmpmat3, fblink[nu], staples, QDP_all);
        QDP_M_eq_sM(tmpmat, fblink[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_M_times_M(tmpmat4, tmpmat3, tmpmat, QDP_all);
        QDP_M_eq_sM(tmpmat, tmpmat4, QDP_neighbor[nu], QDP_backward, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[mu], &Rect, tmpmat, QDP_all);

        QDP_M_eq_Ma_times_M(tmpmat, tmpmat3, tmpmat1, QDP_all);
        QDP_M_eq_sM(tmpmat4, tmpmat, QDP_neighbor[mu], QDP_backward, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[nu], &Rect, tmpmat4, QDP_all);

        QDP_M_eq_sM(tmpmat, fblink[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
        QDP_M_eq_M_times_M(tmpmat3, staples, tmpmat, QDP_all);
        QDP_M_eq_M_times_Ma(tmpmat4, tmpmat3, tmpmat1, QDP_all);
        QDP_M_peq_r_times_M(tempmom_qdp[nu], &Rect, tmpmat4, QDP_all);
        i++;
      }
      
    }

    // Construct the  pgm staples and add them to force
    QDP_M_eq_zero(staples, QDP_all);
    i=0;
    for(nu=0; nu<4; nu++){
      if(nu!=mu){
        k=0;
	for(sig=0; sig<4;sig ++){
	  if(sig!=mu && nu!=sig){
	    
	    // the nu_sig_mu ... staple and 3 reflections
            //tmpmat = Amu["sig"](x+nu)
	    QDP_M_eq_sM(tmpmat, Amu[j[i][k]], QDP_neighbor[nu], QDP_forward, QDP_all);
            //tmpmat1 = Unu(x)*Amu["sig"](x+nu)
            QDP_M_eq_M_times_M(tmpmat1, fblink[nu], tmpmat, QDP_all);   
            //tmpmat3 = Unu(x+mu+sig)
            QDP_M_eq_sM(tmpmat, fblink[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
	    QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[sig], QDP_forward, QDP_all); // HERE?
            //tmpmat2 = Unu(x)*Amu["sig"](x+nu)*adj(Unu(x+mu+sig))
	    QDP_M_eq_M_times_Ma(tmpmat2, tmpmat1, tmpmat3, QDP_all);
            //tmpmat = Usig(x+mu)
	    QDP_M_eq_sM(tmpmat, fblink[sig], QDP_neighbor[mu], QDP_forward, QDP_all);
            //tmpmat1 = Unu(x)*Amu["sig"](x+nu)*adj(Unu(x+mu+sig))*adj(Usig(x+mu))
	    QDP_M_eq_M_times_Ma(tmpmat1, tmpmat2, tmpmat, QDP_all);

	    QDP_M_peq_M(staples, tmpmat1, QDP_all);


            //tmpmat = Amu["sig"](x-nu)
	    QDP_M_eq_sM(tmpmat, Amu[j[i][k]], QDP_neighbor[nu], QDP_backward, QDP_all);
            //tmpmat1 = U_{-nu}(x)*Amu["sig"](x-nu)
            QDP_M_eq_M_times_M(tmpmat1, fblink[OPP_DIR(nu)], tmpmat, QDP_all);   
            //tmpmat3 = U_{-nu}(x+mu+sig)
            QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(nu)], QDP_neighbor[mu], QDP_forward, QDP_all);
	    QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[sig], QDP_forward, QDP_all); // HERE?
            //tmpmat2 = U_{-nu}nu(x)*Amu["sig"](x-nu)*adj(Unu(x+mu+sig))
	    QDP_M_eq_M_times_Ma(tmpmat2, tmpmat1, tmpmat3, QDP_all);
            //tmpmat = Usig(x+mu)
	    QDP_M_eq_sM(tmpmat, fblink[sig], QDP_neighbor[mu], QDP_forward, QDP_all);
            //tmpmat1 = U_{-nu}(x)*Amu["sig"](x-nu)*adj(Unu(x+mu+sig))*adj(Usig(x+mu))
	    QDP_M_eq_M_times_Ma(tmpmat1, tmpmat2, tmpmat, QDP_all);

	    QDP_M_peq_M(staples, tmpmat1, QDP_all);


            //tmpmat = Amu["-sig"](x-nu)
	    QDP_M_eq_sM(tmpmat, Amu[j[i][k]+3], QDP_neighbor[nu], QDP_backward, QDP_all);
            //tmpmat1 = U_{-nu}(x)*Amu["-sig"](x-nu)
            QDP_M_eq_M_times_M(tmpmat1, fblink[OPP_DIR(nu)], tmpmat, QDP_all);   
            //tmpmat = U_{-nu}(x+mu-sig)
            QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(nu)], QDP_neighbor[mu], QDP_forward, QDP_all);
	    QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[sig], QDP_backward, QDP_all); // HERE?
            //tmpmat2 = U_{-nu}nu(x)*Amu["-sig"](x-nu)*adj(Unu(x+mu-sig))
	    QDP_M_eq_M_times_Ma(tmpmat2, tmpmat1, tmpmat3, QDP_all);
            //tmpmat = U_{-sig}(x+mu)
	    QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(sig)], QDP_neighbor[mu], QDP_forward, QDP_all);
            //tmpmat1 = U_{-nu}(x)*Amu["-sig"](x-nu)*adj(Unu(x+mu-sig))*adj(U_{-sig}(x+mu))
	    QDP_M_eq_M_times_Ma(tmpmat1, tmpmat2, tmpmat, QDP_all);

	    QDP_M_peq_M(staples, tmpmat1, QDP_all);

            


            //tmpmat = Amu["-sig"](x+nu)
	    QDP_M_eq_sM(tmpmat, Amu[j[i][k]+3], QDP_neighbor[nu], QDP_forward, QDP_all);
            //tmpmat1 = Unu(x)*Amu["-sig"](x+nu)
            QDP_M_eq_M_times_M(tmpmat1, fblink[nu], tmpmat, QDP_all);   
            //tmpmat3 = Unu(x+mu-sig)
            QDP_M_eq_sM(tmpmat, fblink[nu], QDP_neighbor[mu], QDP_forward, QDP_all);
	    QDP_M_eq_sM(tmpmat3, tmpmat, QDP_neighbor[sig], QDP_backward, QDP_all); // HERE?
            //tmpmat2 = Unu(x)*Amu["-sig"](x+nu)*adj(Unu(x+mu-sig))
	    QDP_M_eq_M_times_Ma(tmpmat2, tmpmat1, tmpmat3, QDP_all);
            //tmpmat = U_{-sig}(x+mu)
	    QDP_M_eq_sM(tmpmat, fblink[OPP_DIR(sig)], QDP_neighbor[mu], QDP_forward, QDP_all);
            //tmpmat1 = Unu(x)*Amu["sig"](x+nu)*adj(Unu(x+mu+sig))*adj(Usig(x+mu))
	    QDP_M_eq_M_times_Ma(tmpmat1, tmpmat2, tmpmat, QDP_all);

	    QDP_M_peq_M(staples, tmpmat1, QDP_all);

	    k++;
	  }//close if sig!=nu ...
	}//close sig loop
	i++;
      }// close if nu!=mu
    }//close the pgm nu loop

    QDP_M_peq_r_times_M(tempmom_qdp[mu], &Pgm, staples, QDP_all);
   

    
  }// closes the mu loop

#ifdef CHKSUM
  QLA_ColorMatrix qcm;
  QLA_Complex det, chk;
  QLA_c_eq_r(chk, 0);
#endif
  for(mu=0; mu<4; mu++){
    QDP_M_eq_M_times_Ma(tmpmat, fblink[mu], tempmom_qdp[mu], QDP_all); // HERE?
    QDP_M_eq_r_times_M_plus_M( tempmom_qdp[mu], &eb3, tmpmat, force->force[mu], QDP_all);// HERE?
    QDP_M_eq_antiherm_M(force->force[mu], tempmom_qdp[mu], QDP_all);// HERE
#ifdef CHKSUM
    QDP_m_eq_sum_M(&qcm, force->force[mu], QDP_all);
    QLA_C_eq_det_M(&det, &qcm);
    QLA_c_peq_c(chk, det);
#endif
  }
#ifdef CHKSUM
  QOP_printf0("chksum: %g %g\n", QLA_real(chk), QLA_imag(chk));
#endif

  //DESTROY various fields

  QDP_destroy_M(tmpmat);
  QDP_destroy_M(tmpmat1);
  QDP_destroy_M(tmpmat2);
  QDP_destroy_M(tmpmat3);
  QDP_destroy_M(staples);
  QDP_destroy_M(tmpmat4);

  for(mu=0; mu<4; mu++){
    QDP_destroy_M(tempmom_qdp[mu]);
  }
  for(i=0; i<6; i++) {
    QDP_destroy_M(Amu[i]);
  }

  for(i=4; i<8; i++) {
    QDP_destroy_M(fblink[i]);
  }

  dtime += QOP_time();

  double nflop = 96720;
  info->final_sec = dtime;
  info->final_flop = nflop*QDP_sites_on_node; 
  info->status = QOP_SUCCESS;
  //QOP_printf0("Time in slow g_force: %e\n", info->final_sec);
} 
void 
QOP_hisq_force_multi_wrapper_fnmat(QOP_info_t *info,  
				   QOP_FermionLinksHisq *flh,
				   QOP_Force *Force, 
				   QOP_hisq_coeffs_t *hisq_coeff,
				   REAL *residues,
				   QDP_ColorVector *x[], 
				   int *n_orders_naik)
  
{
  double dtime = QDP_time();
  int i, ipath, dir;
  REAL coeff_mult;

  double *eps_naik = hisq_coeff->eps_naik;
  int n_naiks = hisq_coeff->n_naiks;
  QOP_hisq_unitarize_method_t umethod = hisq_coeff->umethod;

  // Quark paths sorted by net displacement and last directions
  static Q_path *q_paths_sorted_1 = NULL;
  static Q_path *q_paths_sorted_2 = NULL;
  static Q_path *q_paths_sorted_3 = NULL;

  static int *netbackdir_table_1 = NULL;
  static int *netbackdir_table_2 = NULL;
  static int *netbackdir_table_3 = NULL;

  static int first_force = 1;

  if(first_force == 1) 
    QOP_make_paths_and_dirs_hisq(hisq_coeff, umethod);

  int num_q_paths_1 = qop_get_num_q_paths_1();
  int num_q_paths_2 = qop_get_num_q_paths_2();
  int num_q_paths_3 = qop_get_num_q_paths_3();

  Q_path *q_paths_1 = qop_get_q_paths_1();
  Q_path *q_paths_2 = qop_get_q_paths_2();
  Q_path *q_paths_3 = qop_get_q_paths_3();

  Q_path *q_paths_sorted_current = NULL;
  int *netbackdir_table_current = NULL;

  int inaik;
  int n_naik_shift;
  double final_flop = 0.;
  size_t nflops = 0;

  QDP_ColorMatrix * force[4] =  {Force->force[0], Force->force[1], 
				 Force->force[2], Force->force[3]};

  int num_q_paths_current,n_orders_naik_current;//==nterms


  QDP_ColorMatrix *force_accum_0[4];
  QDP_ColorMatrix *force_accum_0_naik[4];
  QDP_ColorMatrix *force_accum_1[4];
  QDP_ColorMatrix *force_accum_1u[4];
  QDP_ColorMatrix *force_accum_2[4];
  QDP_ColorMatrix *force_final[4];


  QDP_ColorMatrix *Ugf[4], *Vgf[4], *Wgf[4];

  int nterms = 0, n_order_naik_total;

  for(inaik = 0; inaik < n_naiks; inaik++)
    nterms += n_orders_naik[inaik];
  n_order_naik_total = nterms;

  for(i=0;i<4;i++) {
    Ugf[i] = flh->U_links[i];
    Vgf[i] = flh->V_links[i];
    Wgf[i] = flh->W_unitlinks[i];
  }

  QDP_ColorMatrix *tmat;
  QDP_ColorMatrix *mat_tmp0;

  REAL treal;

  if( first_force==1 ){
    if( q_paths_sorted_1==NULL ) 
      q_paths_sorted_1 = (Q_path *)malloc( num_q_paths_1*sizeof(Q_path) );
    if(netbackdir_table_1==NULL ) 
      netbackdir_table_1 = (int *)malloc( num_q_paths_1*sizeof(int) );
    if( q_paths_sorted_2==NULL ) 
      q_paths_sorted_2 = (Q_path *)malloc( num_q_paths_2*sizeof(Q_path) );
    if(netbackdir_table_2==NULL ) 
      netbackdir_table_2 = (int *)malloc( num_q_paths_2*sizeof(int) );
    if( q_paths_sorted_3==NULL ) 
      q_paths_sorted_3 = (Q_path *)malloc( num_q_paths_3*sizeof(Q_path) );
    if(netbackdir_table_3==NULL ) 
      netbackdir_table_3 = (int *)malloc( num_q_paths_3*sizeof(int) );
    else{QOP_printf0("WARNING: remaking sorted path tables\n"); exit(0); }
    // make sorted tables
    sort_quark_paths_hisq( q_paths_1, q_paths_sorted_1, num_q_paths_1, 8 );

    for( ipath=0; ipath<num_q_paths_1; ipath++ )
      netbackdir_table_1[ipath] = 
	find_backwards_gather( &(q_paths_sorted_1[ipath]) );

    sort_quark_paths_hisq( q_paths_2, q_paths_sorted_2, num_q_paths_2, 16 );

    for( ipath=0; ipath<num_q_paths_2; ipath++ )
      netbackdir_table_2[ipath] = 
	find_backwards_gather( &(q_paths_sorted_2[ipath]) );

    sort_quark_paths_hisq( q_paths_3, q_paths_sorted_3, num_q_paths_3, 16 );

    for( ipath=0; ipath<num_q_paths_3; ipath++ )
      netbackdir_table_3[ipath] = 
	find_backwards_gather( &(q_paths_sorted_3[ipath]) );

    first_force=0;
  }

  tmat = QDP_create_M();
  mat_tmp0 = QDP_create_M();

  for(i=XUP;i<=TUP;i++){
     force_accum_0[i] = QDP_create_M();
     force_accum_0_naik[i] = QDP_create_M();
     force_accum_1[i] = QDP_create_M();
     force_accum_1u[i] = QDP_create_M();
     force_accum_2[i] = QDP_create_M();
     force_final[i] = QDP_create_M();
  }


  for(dir=XUP;dir<=TUP;dir++)
    QDP_M_eq_zero(force_accum_2[dir], QDP_all);


  // loop on different naik masses
  n_naik_shift = 0;


  for( inaik=0; inaik<n_naiks; inaik++ ) {

    // smearing level 0
    if( 0==inaik ) {
      n_orders_naik_current = n_order_naik_total;
    }
    else {
      n_orders_naik_current = n_orders_naik[inaik];
    }
    

    QOP_hisq_force_multi_smearing0_fnmat(info,residues+n_naik_shift, 
					 x+n_naik_shift, n_orders_naik_current,
					 force_accum_0, force_accum_0_naik);
    final_flop += info->final_flop;
 
    
    // smearing level 2
    if( 0==inaik ) {
      q_paths_sorted_current = q_paths_sorted_2;
      num_q_paths_current = num_q_paths_2;
      netbackdir_table_current = netbackdir_table_2;
    }
    else {
      q_paths_sorted_current = q_paths_sorted_3;
      num_q_paths_current = num_q_paths_3;
      netbackdir_table_current = netbackdir_table_3;
    }
    
    QOP_hisq_force_multi_smearing_fnmat( info,Wgf,residues+n_naik_shift, 
					 x+n_naik_shift, 
					 n_orders_naik_current, 
					 force_accum_1, 
					 force_accum_0, force_accum_0_naik, 
					 num_q_paths_current, 
					 q_paths_sorted_current, 
					 netbackdir_table_current );
    //QOP_printf0("HISQ smear0 flops = %g\n", info->final_flop);
    final_flop += info->final_flop;

    if( 0==inaik ) {
      coeff_mult = 1.0;
    }
    else {
      coeff_mult = eps_naik[inaik];
    }
    
    
    for(dir=XUP;dir<=TUP;dir++) {
      QDP_M_peq_r_times_M(force_accum_2[dir],&coeff_mult,
			  force_accum_1[dir],QDP_all);
      nflops += 36;
    }
    n_naik_shift += n_orders_naik[inaik];


  }

 

  if ( umethod==QOP_UNITARIZE_NONE ){

    // smearing level 1
    QOP_hisq_force_multi_smearing_fnmat( info,Ugf,residues, 
					 x, 
					 nterms, force_accum_1, 
					 force_accum_2, NULL, 
					 num_q_paths_1, 
					 q_paths_sorted_1, 
					 netbackdir_table_1 );
    final_flop += info->final_flop;
    
  }
  else if ( umethod==QOP_UNITARIZE_RATIONAL ){

    
    // reunitarization
    QOP_hisq_force_multi_reunit(info,Vgf,force_accum_1u,
				force_accum_2);
    //QOP_printf0("reunit flops = %g\n", info->final_flop);
    final_flop += info->final_flop;
    
    // smearing level 1
    QOP_hisq_force_multi_smearing_fnmat( info,Ugf,residues, 
					 x, 
					 nterms, force_accum_1, 
					 force_accum_1u, NULL, 
					 num_q_paths_1, 
					 q_paths_sorted_1, 
					 netbackdir_table_1 );
    //QOP_printf0("HISQ smear1 flops = %g\n", info->final_flop);
    final_flop += info->final_flop;
  }
  else
    {
      QOP_printf0("Unknown or unsupported unitarization method\n");
      exit(1);
      
    }


  // contraction with the link in question should be done here,
  // after contributions from all levels of smearing are taken into account

  for(dir=XUP;dir<=TUP;dir++){

    QDP_M_eq_M_times_M(force_final[dir],Ugf[dir],force_accum_1[dir],QDP_all);
    nflops += 198;

  }



  // take into account even/odd parity (it is NOT done in "smearing" routine)
  //eps multiplication done outside QOP 

  for(dir=XUP;dir<=TUP;dir++){
    QDP_M_eq_M(tmat,force_final[dir],QDP_all);

    treal = 2.0;
    QDP_M_eq_r_times_M(force_final[dir],&treal,tmat,QDP_even);

    treal = -2.0;
    QDP_M_eq_r_times_M(force_final[dir],&treal,tmat,QDP_odd);
    nflops += 18;

  }


  // Put antihermitian traceless part into momentum 
  // add force to momentum

  for(dir=XUP; dir<=TUP; dir++){

    QDP_M_eq_antiherm_M(mat_tmp0, force_final[dir], QDP_all);
    QDP_M_peq_M(force[dir], mat_tmp0, QDP_all);
    nflops += 24+18;
    //QDP_M_peq_M(force_final[dir], force[dir], QDP_all);
    //QDP_M_eq_antiherm_M(force[dir], force_final[dir], QDP_all);

  }



  for(i=XUP;i<=TUP;i++){
     QDP_destroy_M( force_accum_0[i] );
     QDP_destroy_M( force_accum_0_naik[i] );
     QDP_destroy_M( force_accum_1[i] );
     QDP_destroy_M( force_accum_1u[i] );
     QDP_destroy_M( force_accum_2[i] );
     QDP_destroy_M( force_final[i] );
  }

  QDP_destroy_M( tmat );
  QDP_destroy_M( mat_tmp0 );

  final_flop += ((double)nflops)*QDP_sites_on_node;

  info->final_sec = QDP_time() - dtime;
  info->final_flop = final_flop;
  info->status = QOP_SUCCESS;
  //QOP_printf0("HISQ force flops = %g\n", info->final_flop);
} //hisq_force_multi_wrapper_fnmat
static void
get_staple_plaq(QDP_ColorMatrix *staple, int mu, QDP_ColorMatrix *u[],
		QOP_gauge_coeffs_t *coeffs,
		QDP_Subset subset, QDP_Subset osubset)
{
#define NC QDP_get_nc(staple)
  QDP_Lattice *lat = QDP_get_lattice_M(staple);
  int nd = QDP_ndim_L(lat);
  QDP_Shift *neighbor = QDP_neighbor_L(lat);
  QLA_Real plaq = coeffs->plaquette;
  QLA_Real adpl = coeffs->adjoint_plaquette;
#if 1
  QDP_ColorMatrix *temp1, *temp2, *temp3, *temp4, *temp5, *temp6;

  //temp1 = QDP_create_M();
  temp2 = QDP_create_M_L(lat);
  //temp3 = QDP_create_M();
  temp4 = QDP_create_M_L(lat);
  //temp5 = QDP_create_M();
  temp6 = QDP_create_M_L(lat);
  QDP_Complex *tc = NULL;
  if(adpl!=0) tc = QDP_create_C_L(lat);

  /* staple += u[nu](x) u[mu](x+nu) u*[nu](x+mu)
   *         + u*[nu](x-nu) u[mu](x-nu) u[nu](x-nu+mu) */
  for(int nu=0; nu<nd; nu++) {
    if (nu == mu) continue;

    temp1 = QDP_create_M_L(lat);
    temp3 = QDP_create_M_L(lat);
    temp5 = QDP_create_M_L(lat);

    QDP_M_eq_sM(temp1, u[nu], neighbor[mu], QDP_forward, QDP_all_L(lat));
    QDP_M_eq_Ma_times_M(temp2, u[nu], u[mu], osubset);
    QDP_M_eq_sM(temp3, u[mu], neighbor[nu], QDP_forward, subset);
    QDP_M_eq_M_times_M(temp4, temp2, temp1, osubset);
    QDP_M_eq_sM(temp5, temp4, neighbor[nu], QDP_backward, subset);
    QDP_M_eq_M_times_M(temp6, u[nu], temp3, subset);
    //QDP_M_peq_M_times_Ma(staple, temp6, temp1, subset);
    //QDP_M_peq_M(staple, temp5, subset);

    if(adpl==0) {
      QDP_M_peq_M_times_Ma(temp5, temp6, temp1, subset);
      QDP_M_peq_r_times_M(staple, &plaq, temp5, subset);
    } else {
      QLA_Complex z;
      QLA_c_eq_r(z, plaq/adpl);

      QDP_C_eq_c(tc, &z, subset);
      QDP_M_eq_M_times_Ma(temp2, temp6, temp1, subset);
      QDP_C_peq_M_dot_M(tc, temp2, u[mu], subset);
      QDP_C_eq_r_times_C(tc, &adpl, tc, subset);
      QDP_M_peq_C_times_M(staple, tc, temp2, subset);

      QDP_C_eq_c(tc, &z, subset);
      QDP_C_peq_M_dot_M(tc, temp5, u[mu], subset);
      QDP_C_eq_r_times_C(tc, &adpl, tc, subset);
      QDP_M_peq_C_times_M(staple, tc, temp5, subset);
    }

    //QDP_discard_M(temp1);
    //QDP_discard_M(temp3);
    //QDP_discard_M(temp5);
    QDP_destroy_M(temp1);
    QDP_destroy_M(temp3);
    QDP_destroy_M(temp5);

  }  /* closes nu loop */

  if(adpl!=0) QDP_destroy_C(tc);
  //QDP_destroy_M(temp1);
  QDP_destroy_M(temp2);
  //QDP_destroy_M(temp3);
  QDP_destroy_M(temp4);
  //QDP_destroy_M(temp5);
  QDP_destroy_M(temp6);
#else
  QDP_ColorMatrix *t = QDP_create_M_L(lat);
  int nu, path[3];
  QDP_Subset subs[2];
  subs[0] = subset;
  subs[1] = osubset;
  for(nu=0; nu<nd; nu++) {
    if (nu == mu) continue;
    path[0] = 1+nu;
    path[1] = -(1+mu);
    path[2] = -(1+nu);
    path_prod(u, t, path, 3, 1, subs, neighsubeo);
    QDP_M_peq_M(staple, t, subset);
    path[0] = -(1+nu);
    path[1] = -(1+mu);
    path[2] = 1+nu;
    path_prod(u, t, path, 3, 1, subs, neighsubeo);
    QDP_M_peq_M(staple, t, subset);
  }
  QDP_destroy_M(t);
#endif
}