// like link_transport, except doesn't multiply by link matrices.  
// use this, for example,
// when storing the intermediate HISQ force (a connection) at the lattice site
// associated with a link
static void 
link_gather_connection_qdp( QDP_ColorMatrix *dest, 
			    QDP_ColorMatrix *src,
			    QDP_ColorMatrix *work,
			    int dir ){



  if (dir >= 8) //3 link shift needed
    {
      dir=dir-8;

      //do initial 2 shifts
      if( GOES_FORWARDS(dir) ) {
	
	QDP_M_eq_sM(dest, src, QDP_neighbor[dir], QDP_forward, QDP_all);
	QDP_M_eq_sM(work, dest, QDP_neighbor[dir], QDP_forward, QDP_all);

      }
      else { /* GOES_BACKWARDS(dir) */
	
	QDP_M_eq_sM(dest, src, QDP_neighbor[OPP_DIR(dir)], 
		    QDP_backward, QDP_all);
	QDP_M_eq_sM(work, dest, QDP_neighbor[OPP_DIR(dir)], 
		    QDP_backward, QDP_all);

      }
    }
  else{ //only 1 link shift needed

    QDP_M_eq_M(work, src,  QDP_all);

  }

 

  //do final shift
  if( GOES_FORWARDS(dir) ) {

    QDP_M_eq_sM(dest, work, QDP_neighbor[dir], QDP_forward, QDP_all);

  }
  else { /* GOES_BACKWARDS(dir) */

    QDP_M_eq_sM(dest, work, QDP_neighbor[OPP_DIR(dir)], QDP_backward, QDP_all);

  }


} /* link_gather_connection_qdp */
/*  The 3 flavor version of side_link_force used *
 * to optimize fermion transports                */
static void
side_link_forces(int mu, int nu, REAL coeff[], QDP_ColorVector **Path,
		 QDP_ColorVector **Path_nu, QDP_ColorVector **Path_mu,
		 QDP_ColorVector **Path_numu, int nsrc)
{
  REAL m_coeff[nsrc];
  int i;

  for(i=0; i<nsrc; i++) {
    m_coeff[i] = -coeff[i];
  }

  if(GOES_FORWARDS(mu))
    {
      /*                    nu           * 
       * Add the force :  +----+         *
       *               mu |    |         *
       *                  x    (x)       *
       *                  o    o         */
      if(GOES_FORWARDS(nu))
	add_forces_to_mom(Path_numu, Path, mu, coeff, nsrc);
      else
	//add_forces_to_mom(Path,Path_numu,OPP_DIR(mu),m_coeff, nsrc);
	add_forces_to_mom(Path_numu,Path,mu,m_coeff, nsrc);
    }
  else /*GOES_BACKWARDS(mu)*/
    {
      /* Add the force :  o    o         *
       *               mu |    |         *
       *                  x    (x)       *
       *                  +----+         *
       *                    nu           */ 
      if(GOES_FORWARDS(nu))
	add_forces_to_mom(Path_nu, Path_mu, mu, m_coeff, nsrc);
      else
	add_forces_to_mom(Path_mu, Path_nu, OPP_DIR(mu), coeff, nsrc);
    }
}
static int 
find_backwards_gather( Q_path *path ){
  int disp[4], i;
  /* compute total displacement of path */
  for(i=XUP;i<=TUP;i++)disp[i]=0;
  for( i=0; i<path->length; i++){
    if( GOES_FORWARDS(path->dir[i]) )
      disp[        path->dir[i]  ]++;
    else
      disp[OPP_DIR(path->dir[i]) ]--;
  }
  
  // There must be an elegant way??
  if( disp[XUP]==+1 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(XDOWN);
  if( disp[XUP]==-1 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(XUP);
  if( disp[XUP]== 0 && disp[YUP]==+1 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(YDOWN);
  if( disp[XUP]== 0 && disp[YUP]==-1 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(YUP);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]==+1 && disp[TUP]== 0 )
    return(ZDOWN);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]==-1 && disp[TUP]== 0 )
    return(ZUP);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]==+1 )
    return(TDOWN);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]==-1 )
    return(TUP);
  
  if( disp[XUP]==+3 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(X3DOWN);
  if( disp[XUP]==-3 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(X3UP);
  if( disp[XUP]== 0 && disp[YUP]==+3 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(Y3DOWN);
  if( disp[XUP]== 0 && disp[YUP]==-3 && disp[ZUP]== 0 && disp[TUP]== 0 )
    return(Y3UP);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]==+3 && disp[TUP]== 0 )
    return(Z3DOWN);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]==-3 && disp[TUP]== 0 )
    return(Z3UP);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]==+3 )
    return(T3DOWN);
  if( disp[XUP]== 0 && disp[YUP]== 0 && disp[ZUP]== 0 && disp[TUP]==-3 )
    return(T3UP);
  QOP_printf0("OOOPS: NODIR\n"); exit(0);
  return( NODIR );
} //find_backwards_gather
/* special case to transport a "connection" by one link, does both parities */
static void 
link_transport_connection_qdp( QDP_ColorMatrix *dest, QDP_ColorMatrix *src,
			       QDP_ColorMatrix *gf[4], QDP_ColorMatrix *work,
                               QDP_ColorMatrix *st[8], int dir ){
  if( GOES_FORWARDS(dir) ) {
    QDP_M_eq_M(work, src, QDP_all);
    QDP_M_eq_sM(st[dir], work, QDP_neighbor[dir], QDP_forward, QDP_all);
    QDP_M_eq_M_times_M(dest, gf[dir], st[dir], QDP_all);
    QDP_discard_M(st[dir]);
  }
  else { /* GOES_BACKWARDS(dir) */
    QDP_M_eq_Ma_times_M(work, gf[OPP_DIR(dir)], src, QDP_all);
    QDP_M_eq_sM(st[dir], work, QDP_neighbor[OPP_DIR(dir)], 
		QDP_backward,QDP_all);
    QDP_M_eq_M(dest, st[dir], QDP_all);
    QDP_discard_M(st[dir]);
  }
} /* link_transport_connection_qdp */
void
QOP_asqtad_force_multi_asvec_qdp(QOP_info_t *info, QDP_ColorMatrix *links[],
				 QDP_ColorMatrix *force[], QOP_asqtad_coeffs_t *coef,
				 REAL eps[], QDP_ColorVector *xin[], int nsrc)
{
#define NC QDP_get_nc(xin[0])
  REAL coeff[nsrc];
  REAL OneLink[nsrc], Lepage[nsrc], Naik[nsrc], FiveSt[nsrc], ThreeSt[nsrc], SevenSt[nsrc];
  REAL mNaik[nsrc], mLepage[nsrc], mFiveSt[nsrc], mThreeSt[nsrc], mSevenSt[nsrc];

  QDP_ColorVector *P3[8][nsrc];

  QDP_ColorVector *P5[8][nsrc];
  QDP_ColorVector *P5tmp[8][8][nsrc];
  QDP_ColorVector *P5s[4][nsrc];
  QDP_ColorVector *P5tmps[4][8][nsrc];

  //QDP_ColorVector *xin[nsrc];
  QDP_ColorVector *xintmp[8][nsrc];
  QDP_ColorVector *Pmu[nsrc];
  QDP_ColorVector *Pmutmp[8][nsrc];
  QDP_ColorVector *Pnumu[nsrc];
  QDP_ColorVector *Pnumutmp[8][nsrc];
  QDP_ColorVector *Prhonumu[nsrc];
  QDP_ColorVector *Prhonumutmp[8][nsrc];
  QDP_ColorVector *P7[nsrc];
  QDP_ColorVector *P7tmp[8][nsrc];
  QDP_ColorVector *P7rho[nsrc];
  QDP_ColorVector *ttv[nsrc];

  int i, dir;
  int mu, nu, rho, sig;

  double nflop1 = 253935;
  double nflop2 = 433968;
  double nflop = nflop1 + (nflop2-nflop1)*(nsrc-1);
  double dtime;
  dtime = -QOP_time();

  ASQTAD_FORCE_BEGIN;

  QOP_trace("test 1\n");
  /* setup parallel transport */
  QDP_ColorMatrix *tmpmat = QDP_create_M();
  for(i=0; i<QOP_common.ndim; i++) {
    fbshift[i] = QDP_neighbor[i];
    fbshiftdir[i] = QDP_forward;
    fblink[i] = links[i];
    fbshift[OPP_DIR(i)] = QDP_neighbor[i];
    fbshiftdir[OPP_DIR(i)] = QDP_backward;
    fblink[OPP_DIR(i)] = QDP_create_M();
    QDP_M_eq_sM(tmpmat, fblink[i], QDP_neighbor[i], QDP_backward, QDP_all);
    QDP_M_eq_Ma(fblink[OPP_DIR(i)], tmpmat, QDP_all);
  }

  tv = ttv;
  for(i=0; i<nsrc; i++) {
    tv[i] = QDP_create_V();
  }

  QOP_trace("test 2\n");
  /* Allocate temporary vectors */
  for(i=0; i<nsrc; i++) {
    Pmu[i] = QDP_create_V();
    Pnumu[i] = QDP_create_V();
    Prhonumu[i] = QDP_create_V();
    P7[i] = QDP_create_V();
    P7rho[i] = QDP_create_V();
    for(dir=0; dir<8; dir++) {
      xintmp[dir][i] = QDP_create_V();
      Pmutmp[dir][i] = QDP_create_V();
      Pnumutmp[dir][i] = QDP_create_V();
      Prhonumutmp[dir][i] = QDP_create_V();
      P7tmp[dir][i] = QDP_create_V();
    }
#if 1
    for(mu=0; mu<4; mu++) {
      P5s[mu][i] = QDP_create_V();
      for(dir=0; dir<8; dir++) {
	P5tmps[mu][dir][i] = QDP_create_V();
      }
    }
#else
    for(mu=0; mu<8; mu++) {
      P5[mu][i] = QDP_create_V();
      for(dir=0; dir<8; dir++) {
	P5tmp[mu][dir][i] = QDP_create_V();
	//printf("%p %p\n", P5tmp[mu][dir][i], &(P5tmp[mu][dir][i])); fflush(stdout);
	if(P5tmp[mu][dir][i]==NULL) {
	  fprintf(stderr, "error: can't create V\n");
	  QDP_abort();
	}
      }
    }
#endif
  }
  //printf("%p\n", P5tmp[0][4][0]); fflush(stdout);

  for(mu=0; mu<8; mu++) {
    for(i=0; i<nsrc; i++) {
      P3[mu][i] = QDP_create_V();
      //P5[mu][i] = QDP_create_V();
    }
  }

  for(mu=0; mu<4; mu++) {
    tempmom_qdp[mu] = force[mu];
    QDP_M_eqm_M(tempmom_qdp[mu], tempmom_qdp[mu], QDP_odd);
  }

  /* Path coefficients times fermion epsilon */
  /* Load path coefficients from table */
  for(i=0; i<nsrc; i++) {
    OneLink[i] = coef->one_link     * eps[i];
    Naik[i]    = coef->naik         * eps[i]; mNaik[i]    = -Naik[i];
    ThreeSt[i] = coef->three_staple * eps[i]; mThreeSt[i] = -ThreeSt[i];
    FiveSt[i]  = coef->five_staple  * eps[i]; mFiveSt[i]  = -FiveSt[i];
    SevenSt[i] = coef->seven_staple * eps[i]; mSevenSt[i] = -SevenSt[i];
    Lepage[i]  = coef->lepage       * eps[i]; mLepage[i]  = -Lepage[i];
  }

#if 0
  printf("nsrc = %i\n", nsrc);
  printf("coeffs = %g %g %g %g %g %g\n", OneLink[0], ThreeSt[0], FiveSt[0],
	 SevenSt[0], Lepage[0], Naik[0]);
#endif

  /* *************************************** */

  QOP_trace("start force loop\n");
  for(mu=0; mu<8; mu++) {
    //u_shift_hw_fermion(temp_x_qdp, Pmu, OPP_DIR(mu), temp_hw[OPP_DIR(mu)]);
    u_shift_color_vecs(xin, Pmu, OPP_DIR(mu), nsrc, xintmp[OPP_DIR(mu)]);

    for(sig=0; sig<8; sig++) if( (sig!=mu)&&(sig!=OPP_DIR(mu)) ) {
      //u_shift_hw_fermion(Pmu, P3[sig], sig, temp_hw[sig]);
      u_shift_color_vecs(Pmu, P3[sig], sig, nsrc, Pmutmp[sig]);

      if(GOES_FORWARDS(sig)) {
	/* Add the force F_sig[x+mu]:         x--+             *
	 *                                   |   |             *
	 *                                   o   o             *
	 * the 1 link in the path: - (numbering starts form 0) */
	add_forces_to_mom(P3[sig], Pmu, sig, mThreeSt, nsrc);
      }
    }

    for(nu=0; nu<8; nu++) if( (nu!=mu)&&(nu!=OPP_DIR(mu)) ) {
      int nP5 = 0;
      //Pnumu = hw_qdp[OPP_DIR(nu)];
      //u_shift_hw_fermion(Pmu, Pnumu, OPP_DIR(nu), temp_hw[OPP_DIR(nu)]);
      u_shift_color_vecs(Pmu, Pnumu, OPP_DIR(nu), nsrc, Pmutmp[OPP_DIR(nu)]);
      //QDP_V_veq_V(Pnumu, P3[OPP_DIR(nu)], QDP_all, nsrc);
      for(sig=0; sig<8; sig++) if( (sig!=mu)&&(sig!=OPP_DIR(mu)) &&
				   (sig!=nu)&&(sig!=OPP_DIR(nu)) ) {
#if 1
	for(i=0; i<nsrc; i++) {
	  P5[sig][i] = P5s[nP5][i];
	  for(dir=0; dir<8; dir++) P5tmp[sig][dir][i] = P5tmps[nP5][dir][i];
	}
#endif
	nP5++;
	//u_shift_hw_fermion(Pnumu, P5[sig], sig, temp_hw[sig]);
	u_shift_color_vecs(Pnumu, P5[sig], sig, nsrc, Pnumutmp[sig]);

	if(GOES_FORWARDS(sig)) {
	  /* Add the force F_sig[x+mu+nu]:      x--+             *
	   *                                   |   |             *
	   *                                   o   o             *
	   * the 2 link in the path: + (numbering starts form 0) */
	  add_forces_to_mom(P5[sig], Pnumu, sig, FiveSt, nsrc);
	}
      }
      QOP_trace("test 4\n");
      for(rho=0; rho<8; rho++) if( (rho!=mu)&&(rho!=OPP_DIR(mu)) &&
				   (rho!=nu)&&(rho!=OPP_DIR(nu)) ) {
	//Prhonumu = hw_qdp[OPP_DIR(rho)];
	//u_shift_hw_fermion(Pnumu, Prhonumu, OPP_DIR(rho), 
	//		 temp_hw[OPP_DIR(rho)] );
	  u_shift_color_vecs(Pnumu, Prhonumu, OPP_DIR(rho), nsrc,
			     Pnumutmp[OPP_DIR(rho)]);
	  //QDP_V_veq_V(Prhonumu, P5[OPP_DIR(rho)], QDP_all, nsrc);
	for(sig=0; sig<8; sig++) if( (sig!=mu )&&(sig!=OPP_DIR(mu )) &&
				     (sig!=nu )&&(sig!=OPP_DIR(nu )) &&
				     (sig!=rho)&&(sig!=OPP_DIR(rho)) ) {
	  /* Length 7 paths */
	  //P7 = hw_qdp[sig];
	  //u_shift_hw_fermion(Prhonumu, P7, sig, temp_hw[sig] );
  QOP_trace("test 43\n");
	  u_shift_color_vecs(Prhonumu, P7, sig, nsrc, Prhonumutmp[sig]);
  QOP_trace("test 44\n");
	  //QDP_V_eq_r_times_V(P7[0], &SevenSt[0], P7[0], QDP_all);
	  //QDP_V_eq_r_times_V(P7[1], &SevenSt[1], P7[1], QDP_all);
	  if(GOES_FORWARDS(sig)) {
	    /* Add the force F_sig[x+mu+nu+rho]:  x--+             *
	     *                                   |   |             *
	     *                                   o   o             *
	     * the 3 link in the path: - (numbering starts form 0) */
  QOP_trace("test 45\n");
	    add_forces_to_mom(P7, Prhonumu, sig, mSevenSt, nsrc);
  QOP_trace("test 46\n");
	    //mom_meq_force(P7, Prhonumu, sig);
	  }
	  /* Add the force F_rho the 2(4) link in the path: +     */
	  //P7rho = hw_qdp[rho];
	  //u_shift_hw_fermion(P7, P7rho, rho, temp_hw[rho]);
  QOP_trace("test 47\n");
	  u_shift_color_vecs(P7, P7rho, rho, nsrc, P7tmp[rho]);
  QOP_trace("test 48\n");
	  side_link_forces(rho,sig,SevenSt,Pnumu,P7,Prhonumu,P7rho, nsrc);
  QOP_trace("test 49\n");
	  //side_link_3f_force2(rho,sig,Pnumu,P7,Prhonumu,P7rho);
	  /* Add the P7rho vector to P5 */
	  for(i=0; i<nsrc; i++) {
	    if(FiveSt[i]!=0) coeff[i] = SevenSt[i]/FiveSt[i];
	    else coeff[i] = 0;
  QOP_trace("test 410\n");
	    QDP_V_peq_r_times_V(P5[sig][i], &coeff[i], P7rho[i], QDP_all);
  QOP_trace("test 411\n");
	  }
	} /* sig */
      } /* rho */
  QOP_trace("test 5\n");
#define P5nu P7
      for(sig=0; sig<8; sig++) if( (sig!=mu)&&(sig!=OPP_DIR(mu)) &&
				   (sig!=nu)&&(sig!=OPP_DIR(nu)) ) {
	/* Length 5 paths */
	/* Add the force F_nu the 1(3) link in the path: -     */
	//P5nu = hw_qdp[nu];
	//u_shift_hw_fermion(P5[sig], P5nu, nu, temp_hw[nu]);
	u_shift_color_vecs(P5[sig], P5nu, nu, nsrc, P5tmp[sig][nu]);
	side_link_forces(nu, sig, mFiveSt, Pmu, P5[sig], Pnumu, P5nu, nsrc);
	/* Add the P5nu vector to P3 */
	for(i=0; i<nsrc; i++) {
	  if(ThreeSt[i]!=0) coeff[i] = FiveSt[i]/ThreeSt[i]; 
	  else coeff[i] = 0;
	  QDP_V_peq_r_times_V(P3[sig][i], &coeff[i], P5nu[i], QDP_all);
	}
      } /* sig */
    } /* nu */

#define Pmumu Pnumu
#define Pmumutmp Pnumutmp
#define P5sig Prhonumu
#define P5sigtmp Prhonumutmp
#define P3mu P7
#define Popmu P7
#define Pmumumu P7
    /* Now the Lepage term... It is the same as 5-link paths with
       nu=mu and FiveSt=Lepage. */
    //u_shift_hw_fermion(Pmu, Pmumu, OPP_DIR(mu), temp_hw[OPP_DIR(mu)] );
    u_shift_color_vecs(Pmu, Pmumu, OPP_DIR(mu), nsrc, Pmutmp[OPP_DIR(mu)]);

    for(sig=0; sig<8; sig++) if( (sig!=mu)&&(sig!=OPP_DIR(mu)) ) {
      //P5sig = hw_qdp[sig];
      //u_shift_hw_fermion(Pmumu, P5sig, sig, temp_hw[sig]);
      u_shift_color_vecs(Pmumu, P5sig, sig, nsrc, Pmumutmp[sig]);
      if(GOES_FORWARDS(sig)) {
	/* Add the force F_sig[x+mu+nu]:      x--+             *
	 *                                   |   |             *
	 *                                   o   o             *
	 * the 2 link in the path: + (numbering starts form 0) */
	add_forces_to_mom(P5sig, Pmumu, sig, Lepage, nsrc);
      }
      /* Add the force F_nu the 1(3) link in the path: -     */
      //P5nu = hw_qdp[mu];
      //u_shift_hw_fermion(P5sig, P5nu, mu, temp_hw[mu]);
      u_shift_color_vecs(P5sig, P5nu, mu, nsrc, P5sigtmp[mu]);
      side_link_forces(mu, sig, mLepage, Pmu, P5sig, Pmumu, P5nu, nsrc);
      /* Add the P5nu vector to P3 */
      for(i=0; i<nsrc; i++) {
	if(ThreeSt[i]!=0) coeff[i] = Lepage[i]/ThreeSt[i];
	else coeff[i] = 0;
	QDP_V_peq_r_times_V(P3[sig][i], &coeff[i], P5nu[i], QDP_all);
      }

      /* Length 3 paths (Not the Naik term) */
      /* Add the force F_mu the 0(2) link in the path: +     */
      if(GOES_FORWARDS(mu)) {
	//P3mu = hw_qdp[mu];  /* OK to clobber P5nu */
	//u_shift_hw_fermion(P3[sig], P3mu, mu, temp_hw[mu]);
	//u_shift_color_vecs(P3[sig], P3mu, mu, 2, temp_hw[mu]);
	for(i=0; i<nsrc; i++) {
	  QDP_V_eq_V(P5sig[i], P3[sig][i], QDP_all);
	}
	u_shift_color_vecs(P5sig, P3mu, mu, nsrc, P5sigtmp[mu]);
      }
      /* The above shift is not needed if mu is backwards */
      side_link_forces(mu, sig, ThreeSt, xin, P3[sig], Pmu, P3mu, nsrc);
    }

    /* Finally the OneLink and the Naik term */
    if(GOES_BACKWARDS(mu)) {
      /* Do only the forward terms in the Dslash */
      /* Because I have shifted with OPP_DIR(mu) Pmu is a forward *
       * shift.                                                   */
      /* The one link */
      add_forces_to_mom(Pmu, xin, OPP_DIR(mu), OneLink, nsrc);
      /* For the same reason Pmumu is the forward double link */

      /* Popmu is a backward shift */
      //Popmu = hw_qdp[mu]; /* OK to clobber P3mu */
      //u_shift_hw_fermion(xin, Popmu, mu, temp_hw[mu]);
      u_shift_color_vecs(xin, Popmu, mu, nsrc, xintmp[mu]);
      /* The Naik */
      /* link no 1: - */
      add_forces_to_mom(Pmumu, Popmu, OPP_DIR(mu), mNaik, nsrc);
      /* Pmumumu can overwrite Popmu which is no longer needed */
      //Pmumumu = hw_qdp[OPP_DIR(mu)];
      //u_shift_hw_fermion(Pmumu, Pmumumu, OPP_DIR(mu), temp_hw[OPP_DIR(mu)]);
      u_shift_color_vecs(Pmumu, Pmumumu, OPP_DIR(mu), nsrc, Pmumutmp[OPP_DIR(mu)]);
      /* link no 0: + */
      add_forces_to_mom(Pmumumu, xin, OPP_DIR(mu), Naik, nsrc);
    } else {
      /* The rest of the Naik terms */
      //Popmu = hw_qdp[mu]; /* OK to clobber P3mu */
      //u_shift_hw_fermion(xin, Popmu, mu, temp_hw[mu]);
      u_shift_color_vecs(xin, Popmu, mu, nsrc, xintmp[mu]);
      /* link no 2: + */
      /* Pmumu is double backward shift */
      add_forces_to_mom(Popmu, Pmumu, mu, Naik, nsrc);
    }
    /* Here we have to do together the Naik term and the one link term */

  }/* mu */
  QOP_trace("test 6\n");
  QOP_trace("test 7\n");

  for(mu=0; mu<4; mu++) {
    QDP_M_eq_M(tmpmat, tempmom_qdp[mu], QDP_even);
    QDP_M_eqm_M(tmpmat, tempmom_qdp[mu], QDP_odd);
    QDP_M_eq_antiherm_M(tempmom_qdp[mu], tmpmat, QDP_all);
  }
  QDP_destroy_M(tmpmat);

  //printf("%p\n", P5tmp[0][4][0]); fflush(stdout);
  //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
  /* Free temporary vectors */
  for(i=0; i<nsrc; i++) {
    QDP_destroy_V(Pmu[i]);
    QDP_destroy_V(Pnumu[i]);
    QDP_destroy_V(Prhonumu[i]);
    QDP_destroy_V(P7[i]);
    QDP_destroy_V(P7rho[i]);
    //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
    for(dir=0; dir<8; dir++) {
      QDP_destroy_V(xintmp[dir][i]);
      QDP_destroy_V(Pmutmp[dir][i]);
      QDP_destroy_V(Pnumutmp[dir][i]);
      QDP_destroy_V(Prhonumutmp[dir][i]);
      QDP_destroy_V(P7tmp[dir][i]);
    }
    //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
    for(mu=0; mu<4; mu++) {
      //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
      QDP_destroy_V(P5s[mu][i]);
      //QDP_destroy_V(P5[mu][i]);
      //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
      for(dir=0; dir<8; dir++) {
	//if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
	QDP_destroy_V(P5tmps[mu][dir][i]);
	//printf("%p\n", P5tmp[mu][dir][i]); fflush(stdout);
	//QDP_destroy_V(P5tmp[mu][dir][i]);
	//if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
      }
      //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
    }
    //if(QDP_this_node==0) { printf("line %i\n",__LINE__); fflush(stdout); }
  }

  //if(QDP_this_node==0) { printf("here3\n"); fflush(stdout); }
  for(mu=0; mu<8; mu++) {
    for(i=0; i<nsrc; i++) {
      QDP_destroy_V(P3[mu][i]);
    }
    //QDP_destroy_V(P5[mu][0]);
    //QDP_destroy_V(P5[mu][1]);
  }

  for(i=0; i<nsrc; i++) {
    QDP_destroy_V(tv[i]);
  }

  //if(QDP_this_node==0) { printf("here4\n"); fflush(stdout); }
  for(i=4; i<8; i++) {
    QDP_destroy_M(fblink[i]);
  }

  dtime += QOP_time();
  info->final_sec = dtime;
  info->final_flop = nflop*QDP_sites_on_node;
  info->status = QOP_SUCCESS;

  ASQTAD_FORCE_END;
#undef NC
}
static su3_matrix *create_longlinks_qop_milc(QOP_info_t *info, 
					     QOP_asqtad_coeffs_t *coeffs,
					     QOP_GaugeField *gauge) {
  register int i;
  register site *s;
  int ipath,dir;
  int disp[4];
  int num_q_paths = ks_act_paths.num_q_paths;
  Q_path *q_paths = ks_act_paths.q_paths;
  register su3_matrix *long1;
  su3_matrix *staple, *tempmat1;
  int nflop = 1804;
  double dtime;
  su3_matrix *t_ll;
  char myname[] = "create_longlinks_qop_milc";
  
  dtime=-dclock();

  if( phases_in != 1){
    node0_printf("BOTCH: %s needs phases in\n",myname);
    terminate(0);
  }
  /* Allocate space for t_longlink if NULL */
  t_ll = (su3_matrix *)special_alloc(sites_on_node*4*sizeof(su3_matrix));
  if(t_ll==NULL){
    printf("NODE %d: no room for t_ll\n",this_node);
    terminate(1);
  }
  
  staple = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
  if(staple == NULL){
    printf("%s: Can't malloc temporary\n",myname);
    terminate(1);
  }

  tempmat1 = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
  if(tempmat1 == NULL){
    printf("%s: Can't malloc temporary\n",myname);
    terminate(1);
  }

  for (dir=XUP; dir<=TUP; dir++){ /* loop over longlink directions */
    /* set longlink to zero */
    FORALLSITES(i,s){
      long1 = &(t_ll[4*i+dir]);
      clear_su3mat( long1 );
    }

    /* loop over paths, checking for ones with total displacement 3*dir */
    for( ipath=0; ipath<num_q_paths; ipath++ ){  /* loop over paths */
	/* compute total displacement of path */
	for(i=XUP;i<=TUP;i++)disp[i]=0;
	for( i=0; i<q_paths[ipath].length; i++){
	  if( GOES_FORWARDS(q_paths[ipath].dir[i]) )
	    disp[        q_paths[ipath].dir[i]  ]++;
	  else
	    disp[OPP_DIR(q_paths[ipath].dir[i]) ]--;
	}
	for( disp[dir]+=3,i=XUP; i<=TUP; i++)if(disp[i]!=0)break;
	if( i<=TUP )continue;  /* skip if path doesn't go to right place */
/**printf("ipath = %d, found a path:  ",ipath);
for(j=0;j<q_paths[ipath].length;j++)printf("\t%d", q_paths[ipath].dir[j]);
printf("\n");**/

	path_product_qop_milc( q_paths[ipath].dir, q_paths[ipath].length, 
			       tempmat1, gauge );
	FORALLSITES(i,s){
	  su3_adjoint( &tempmat1[i], &staple[i] );
	  long1 = &(t_ll[4*i+dir]);
          scalar_mult_add_su3_matrix( long1,
	    &staple[i], -q_paths[ipath].coeff, long1 );
		/* minus sign in coeff. because we used backward path*/
	}
    } /* ipath */
/* Smearing level i*/
static void 
QOP_hisq_force_multi_smearing_fnmat(QOP_info_t *info, 
				    QDP_ColorMatrix * gf[4],
				    REAL *residues,
				    QDP_ColorVector *x[], 
				    int nterms, 
				    QDP_ColorMatrix *force_accum[4],
				    QDP_ColorMatrix *force_accum_old[4],
				    QDP_ColorMatrix *force_accum_naik_old[4],
				    int internal_num_q_paths,
				    Q_path *internal_q_paths_sorted,
				    int *internal_netbackdir_table)
{
  int i,j,k,lastdir=-99,ipath,ilink;
  int length,dir,odir;
  REAL coeff;

  QDP_ColorMatrix *tmat;
  QDP_ColorMatrix *oprod_along_path[MAX_PATH_LENGTH+1];
  QDP_ColorMatrix *mats_along_path[MAX_PATH_LENGTH+1];
  QDP_ColorMatrix *mat_tmp0,*mat_tmp1, *stmp[8];;
  QDP_ColorVector *vec_tmp[2];

  int netbackdir;
  size_t nflops = 0;

// table of net path displacements (backwards from usual convention)

  Q_path *this_path;	// pointer to current path

  /* Allocate fields */
  for(i=0;i<=MAX_PATH_LENGTH;i++){
    oprod_along_path[i] = QDP_create_M();
  }
  for(i=1;i<=MAX_PATH_LENGTH;i++){ 
    // 0 element is never used (it's unit matrix)
    mats_along_path[i] = QDP_create_M();
  }

  mat_tmp0   = QDP_create_M();
  mat_tmp1   = QDP_create_M();
  for(i=0; i<8; i++) stmp[i] = QDP_create_M();
  tmat       = QDP_create_M();
  vec_tmp[0] = QDP_create_V();
  vec_tmp[1] = QDP_create_V();
 
  // clear force accumulators
  for(dir=XUP;dir<=TUP;dir++)
    QDP_M_eq_zero(force_accum[dir], QDP_all);

  // loop over paths, and loop over links in path 
  for( ipath=0; ipath<internal_num_q_paths; ipath++ ){
    this_path = &(internal_q_paths_sorted[ipath]); 
    if(this_path->forwback== -1)continue;	// skip backwards dslash 
    length = this_path->length;
    netbackdir = internal_netbackdir_table[ipath];

    // move f(i-1) force from current site in positive direction,
    //  this corresponds to outer product |X><Y| calculated at the endpoint of the path 
    if( netbackdir<8) { // Not a Naik path
      link_gather_connection_qdp(oprod_along_path[0] , 
				 force_accum_old[OPP_DIR(netbackdir)],
				 tmat, netbackdir );
    }
    else { // Naik path
      if( NULL==force_accum_naik_old ) {
        QOP_printf0( "hisq_force_multi_smearing_fnmat:  mismatch:\n" );
        QOP_printf0( "force_accum_naik_old is NULL, but path table contains Naik paths(!)\n" );
        exit(0);
      }
      // CONVERSION FROM 3-LINK DIRECTION TO 1-LINK DIRECTION
      link_gather_connection_qdp(oprod_along_path[0] , 
				 force_accum_naik_old[OPP_DIR(netbackdir-8)],
				 tmat, netbackdir );
    }

    // figure out how much of the outer products along the path must be
    // recomputed. j is last one needing recomputation. k is first one.
    j=length-1; // default is recompute all
    if( GOES_BACKWARDS(this_path->dir[0]) ) k=1; else k=0;

    for(ilink=j;ilink>=k;ilink--){
      link_transport_connection_qdp( oprod_along_path[length-ilink], 
				     oprod_along_path[length-ilink-1], gf,
				     mat_tmp0, stmp, this_path->dir[ilink]  );
      nflops += 198;
    }

    // maintain an array of transports "to this point" along the path.
    //	Don't recompute beginning parts of path if same as last path 
    ilink=0; // first link where new transport is needed
    // Sometimes we don't need the matrix for the last link
    if( GOES_FORWARDS(this_path->dir[length-1]) ) k=length-1; else k=length;

    for( ; ilink<k; ilink++ ){
      if( ilink==0 ){
        dir = this_path->dir[0];
	if( GOES_FORWARDS(dir) ){
	  QDP_M_eq_sM(tmat, gf[dir], QDP_neighbor[dir],
		      QDP_backward, QDP_all);
	  QDP_M_eq_Ma(mats_along_path[1], tmat, QDP_all);
	  QDP_discard_M(tmat);
	}
	else{
	  QDP_M_eq_M(mats_along_path[1], gf[OPP_DIR(dir)], QDP_all);
	}
      }
      else { // ilink != 0
        dir = OPP_DIR(this_path->dir[ilink]);

	link_transport_connection_qdp( mats_along_path[ilink+1], 
				       mats_along_path[ilink], gf,
				       mat_tmp0, stmp, dir );
	nflops += 198;
      }
    } // end loop over links

    // A path has (length+1) points, counting the ends.  At first
    //	 point, no "down" direction links have their momenta "at this
    //	 point". At last, no "up" ... 
    if( GOES_FORWARDS(this_path->dir[length-1]) ) k=length-1; else k=length;
    for( ilink=0; ilink<=k; ilink++ ){
      if(ilink<length)dir = this_path->dir[ilink];
      else dir=NODIR;
      coeff = this_path->coeff;
      if( (ilink%2)==1 )coeff = -coeff;
      // add in contribution to the force 
      if( ilink<length && GOES_FORWARDS(dir) ){
	link_gather_connection_qdp(mat_tmp1, 
		       oprod_along_path[length-ilink-1], tmat, dir );
        if(ilink==0) 
	  {
	    QDP_M_eq_M(mat_tmp0,mat_tmp1,QDP_all);
	  }
        else
	  {
	    QDP_M_eq_M_times_Ma(mat_tmp0, mats_along_path[ilink], 
				mat_tmp1, QDP_all);
	    nflops += 198;
	    QDP_M_eq_Ma(mat_tmp1,mat_tmp0,QDP_all);
	  }
	QDP_M_peq_r_times_M(force_accum[dir],&coeff,mat_tmp1,QDP_all);
	nflops += 36;
      }
      if( ilink>0 && GOES_BACKWARDS(lastdir) ){
	odir = OPP_DIR(lastdir);
        if( ilink==1 ){
	  QDP_M_eq_M(mat_tmp0,oprod_along_path[length-ilink],QDP_all);
	  QDP_M_eq_Ma(mat_tmp1,mat_tmp0,QDP_all);
	}
        else{
	  link_gather_connection_qdp(mat_tmp1, mats_along_path[ilink-1], 
				     tmat, odir );
	  QDP_M_eq_M_times_Ma(mat_tmp0, oprod_along_path[length-ilink], 
			      mat_tmp1, QDP_all);
	  nflops += 198;
	  QDP_M_eq_Ma(mat_tmp1, mat_tmp0, QDP_all);
        }
	QDP_M_peq_r_times_M(force_accum[odir],&coeff,mat_tmp1,QDP_all);
	nflops += 36;
      }
      lastdir = dir;
    } // end loop over links in path //
  } // end loop over paths //

  QDP_destroy_V( vec_tmp[0] );
  QDP_destroy_V( vec_tmp[1] );
  QDP_destroy_M( mat_tmp0 );
  QDP_destroy_M( mat_tmp1 );
  QDP_destroy_M( tmat );
  for(i=0; i<8; i++) QDP_destroy_M(stmp[i]);
  for(i=0;i<=MAX_PATH_LENGTH;i++){
    QDP_destroy_M( oprod_along_path[i] );
  }
  for(i=1;i<=MAX_PATH_LENGTH;i++){
    QDP_destroy_M( mats_along_path[i] );
  }

  info->final_flop = ((double)nflops)*QDP_sites_on_node;

  return;
}//hisq_force_multi_smearing_fnmat
/* update the momenta with the gauge force */
void QOP_symanzik_1loop_gauge_force(QOP_info_t *info, QOP_GaugeField *gauge, 
		    QOP_Force *force, QOP_gauge_coeffs_t *coeffs, Real eps)
{
    register int i,dir;
    register site *st;
    su3_matrix tmat1;
    register Real eb3;    /* Note: eps now includes eps*beta */
    register su3_matrix* momentum;
    su3_matrix *staple, *tempmat1;

    /* lengths of various kinds of loops */
    int *loop_length = get_loop_length();
    /* number of rotations/reflections  for each kind */
    int *loop_num = get_loop_num();
    /* table of directions, 1 for each kind of loop */
    int ***loop_table = get_loop_table();
    /* table of coefficients in action, for various "representations"
	(actually, powers of the trace) */
    Real **loop_coeff = get_loop_coeff(); /* We make our own */
    int max_length = get_max_length(); /* For Symanzik 1 loop! */
    int nloop = get_nloop();
    int nreps = get_nreps();
    su3_matrix *forwardlink[4];
    su3_matrix *tmpmom[4];

    int nflop = 153004;  /* For Symanzik1 action */
    Real final_flop;
    double dtime;
    int j,k;
    int *dirs,length;
    int *path_dir,path_length;

    int ln,iloop;
    Real action,act2,new_term;

    int ncount;
    char myname[] = "imp_gauge_force";

    dtime=-dclock();

    info->status = QOP_FAIL;

    /* Parity requirements */
    if(gauge->evenodd != QOP_EVENODD ||
       force->evenodd != QOP_EVENODD
       )
      {
	printf("QOP_asqtad_force: Bad parity gauge %d force %d\n",
	       gauge->evenodd, force->evenodd);
	return;
      }

    /* Map field pointers to local static pointers */
    
    FORALLUPDIR(dir){
      forwardlink[dir] = gauge->g + dir*sites_on_node;
      tmpmom[dir]  = force->f + dir*sites_on_node;
    }
    /* Check loop coefficients */

    if(coeffs->plaquette != loop_coeff[0][0] ||
       coeffs->rectangle != loop_coeff[1][0] ||
       coeffs->parallelogram != loop_coeff[2][0])
      {
	printf("%s(%d): Path coeffs don't match\n",myname,this_node);
	return;
      }

    /* Allocate arrays according to action */
    dirs = (int *)malloc(max_length*sizeof(int));
    if(dirs == NULL){
      printf("%s(%d): Can't malloc dirs\n",myname,this_node);
      return;
    }

    path_dir = (int *)malloc(max_length*sizeof(int));
    if(path_dir == NULL){
      printf("%s(%d): Can't malloc path_dir\n",myname,this_node);
      return;
    }
    staple = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
    if(staple == NULL){
      printf("%s(%d): Can't malloc temporary\n",myname,this_node);
      return;
    }

    tempmat1 = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
    if(tempmat1 == NULL){
      printf("%s(%d): Can't malloc temporary\n",myname,this_node);
      return;
    }

    eb3 = eps/3.0;

    /* Loop over directions, update mom[dir] */
    for(dir=XUP; dir<=TUP; dir++){

	FORALLSITES(i,st)for(j=0;j<3;j++)for(k=0;k<3;k++){
			staple[i].e[j][k]=cmplx(0.0,0.0);
	} END_LOOP

	ncount=0;
	for(iloop=0;iloop<nloop;iloop++){
	    length=loop_length[iloop];
	    for(ln=0;ln<loop_num[iloop];ln++){
/**printf("UPD:  "); printpath( loop_table[iloop][ln], length );**/
		/* set up dirs.  we are looking at loop starting in "XUP"
		   direction, rotate so it starts in "dir" direction. */
		for(k=0;k<length;k++){
                    if( GOES_FORWARDS(loop_table[iloop][ln][k]) ){
                	dirs[k]=(dir+loop_table[iloop][ln][k] )% 4;
		    }
            	    else {
                        dirs[k]=OPP_DIR(
			    (dir+OPP_DIR(loop_table[iloop][ln][k]))%4 );
		    }
		}

		path_length= length-1;  /* generalized "staple" */

		/* check for links in direction of momentum to be
		   updated, each such link gives a contribution. Note
		   the direction of the path - opposite the link. */
		for(k=0;k<length;k++)if( dirs[k]==dir||dirs[k]==OPP_DIR(dir)) {
		    if( GOES_FORWARDS(dirs[k]) ) for(j=0;j<path_length;j++) {
			path_dir[j] = dirs[(k+j+1)%length];
		    }
		    if( GOES_BACKWARDS(dirs[k]) ) for(j=0;j<path_length;j++) {
			path_dir[path_length-1-j] =
			    OPP_DIR(dirs[(k+j+1)%length]);
		    }
/**if(dir==XUP)printf("X_UPDATE PATH: "); printpath( path_dir, path_length );**/
		    path_product(path_dir,path_length, tempmat1);

		    /* We took the path in the other direction from our
			old convention in order to get it to end up
			"at our site", so now take adjoint */
		    /* then compute "single_action" contribution to
			staple */
		    FORALLSITES(i,st){
			su3_adjoint( &(tempmat1[i]), &tmat1 );
			/* first we compute the fundamental term */
			new_term = loop_coeff[iloop][0];

			/* now we add in the higher representations */
			if(nreps > 1){
node0_printf("WARNING: THIS CODE IS NOT TESTED\n"); exit(0);
			    act2=1.0;
			    action = 3.0 - realtrace_su3(forwardlink[dir]+i,
			      &tmat1 ); 

			    for(j=1;j<nreps;j++){
				act2 *= action;
				new_term +=
				    loop_coeff[iloop][j]*act2*(Real)(j+1);
			    }
			}  /* end if nreps > 1 */

			scalar_mult_add_su3_matrix( &(staple[i]), &tmat1,
				new_term, &(staple[i]) );

		    } END_LOOP

		    ncount++;

		} /* k (location in path) */
	    } /* ln */
	} /* iloop */

	/* Now multiply the staple sum by the link, then update momentum */
	FORALLSITES(i,st){
	    mult_su3_na( forwardlink[dir]+i, &(staple[i]), &tmat1 );
	    momentum = tmpmom[dir] + i;
	    scalar_mult_sub_su3_matrix( momentum, &tmat1,
		eb3, momentum );
	} END_LOOP
/*-------------------------------------------------------------------*/
void
load_lnglinks(info_t *info, su3_matrix *lng, ks_component_paths *p,
	      su3_matrix *links ) {
  register int i;
  
  int ipath,dir;
  int disp[4];
  int num_q_paths = p->num_q_paths;
  Q_path *q_paths = p->q_paths;
  register su3_matrix *long1;
  su3_matrix *staple = NULL, *tempmat1 = NULL;
  char myname[] = "load_lnglinks";
  double dtime = -dclock();

  if( phases_in != 1){
    node0_printf("BOTCH: %s needs phases in\n",myname);
    terminate(0);
  }

  staple = create_m_special();
  tempmat1 = create_m_special();

  for (dir=XUP; dir<=TUP; dir++){ /* loop over longlink directions */
    /* set longlink to zero */
    FORALLFIELDSITES_OMP(i,private(long1)){
      long1 = lng + 4*i +dir;
      clear_su3mat( long1 );
    } END_LOOP_OMP;

    /* loop over paths, checking for ones with total displacement 3*dir */
    for( ipath=0; ipath<num_q_paths; ipath++ ){  /* loop over paths */
	/* compute total displacement of path */
	for(i=XUP;i<=TUP;i++)disp[i]=0;
	for( i=0; i<q_paths[ipath].length; i++){
	  if( GOES_FORWARDS(q_paths[ipath].dir[i]) )
	    disp[        q_paths[ipath].dir[i]  ]++;
	  else
	    disp[OPP_DIR(q_paths[ipath].dir[i]) ]--;
	}
	for( disp[dir]+=3,i=XUP; i<=TUP; i++)if(disp[i]!=0)break;
	if( i<=TUP )continue;  /* skip if path doesn't go to right place */
/**printf("ipath = %d, found a path:  ",ipath);
for(j=0;j<q_paths[ipath].length;j++)printf("\t%d", q_paths[ipath].dir[j]);
printf("\n");**/

//	path_product_field( q_paths[ipath].dir, q_paths[ipath].length, 
//			    tempmat1, links );
	path_product_fields( links, q_paths[ipath].dir, q_paths[ipath].length, 
			     tempmat1 );
	FORALLFIELDSITES(i){
	  su3_adjoint( &tempmat1[i], &staple[i] );
	  long1 = lng + 4*i + dir;
          scalar_mult_add_su3_matrix( long1,
	    &staple[i], -q_paths[ipath].coeff, long1 );
		/* minus sign in coeff. because we used backward path*/
	}
    } /* ipath */

  } /* loop over directions */


  destroy_m_special(staple); staple = NULL;
  destroy_m_special(tempmat1); tempmat1 = NULL;


  dtime += dclock();
  info->final_sec = dtime;
  info->final_flop = 1728.*volume/numnodes();  /* (formerly 1804) */

}  /* load_lnglinks() */
void
load_fatlinks_cpu(info_t *info, su3_matrix *fat, ks_component_paths *p, 
		  su3_matrix *links){
  register int i;
  int dir;
  register su3_matrix *fat1;
  su3_matrix *staple = NULL, *tempmat1 = NULL;
  char myname[] = "load_fatlinks_cpu";

#ifdef ASQ_OPTIMIZED_FATTENING
  int  nu,rho,sig ;
  Real one_link;
#else
  int ipath;
  int disp[4];
  int num_q_paths = p->num_q_paths;
  Q_path *q_paths = p->q_paths;
#endif

  double dtime = -dclock();

  staple = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
  if(staple == NULL){
    printf("%s: Can't malloc temporary\n",myname);
    terminate(1);
  }

  tempmat1 = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
  if(tempmat1 == NULL){
    printf("%s: Can't malloc temporary\n",myname);
    terminate(1);
  }

#ifndef  ASQ_OPTIMIZED_FATTENING   /* general case code */
  for (dir=XUP; dir<=TUP; dir++){ /* loop over fatlink directions */
    /* set fatlink to zero */
    FORALLFIELDSITES(i){
      fat1 = fat + 4*i + dir;
      clear_su3mat( fat1 );
    }
    
    /* loop over paths, checking for ones with total displacement 1*dir */
    for( ipath=0; ipath<num_q_paths; ipath++ ){  /* loop over paths */
	/* compute total displacement of path */
	for(i=XUP;i<=TUP;i++)disp[i]=0;
	for( i=0; i<q_paths[ipath].length; i++){
	  if( GOES_FORWARDS(q_paths[ipath].dir[i]) )
	    disp[        q_paths[ipath].dir[i]  ]++;
	  else
	    disp[OPP_DIR(q_paths[ipath].dir[i]) ]--;
	}
	for( disp[dir]+=1,i=XUP; i<=TUP; i++)if(disp[i]!=0)break;
	if( i<=TUP )continue;  /* skip if path doesn't go to right place */
/**printf("dir = %d, found a path:  ",dir);
for(j=0;j<q_paths.[ipath].length;j++)printf("\t%d", q_paths[ipath].dir[j]);
printf("\n");**/

//	path_product( q_paths[ipath].dir, q_paths[ipath].length, tempmat1 );
//	path_product_field( q_paths[ipath].dir, q_paths[ipath].length, 
//			    tempmat1, links );
	path_product_fields( links, q_paths[ipath].dir, q_paths[ipath].length, 
			     tempmat1 );
	FORALLFIELDSITES(i){
	  su3_adjoint( &tempmat1[i], &staple[i] );
	  fat1 = fat +  4*i + dir;
          scalar_mult_add_su3_matrix( fat1,
	    &staple[i], -q_paths[ipath].coeff, fat1 );
		/* minus sign in coeff. because we used backward path*/
	}
    } /* ipath */
  } /* loop over directions */
#else	/* ASQ_OPTIMIZED_FATTENING, for Asq and Asqtad actions */
/*  Optimized fattening code for the Asq and Asqtad actions.           *
 * I assume that path 0 is the one link path 2 the 3-staple            *
 * path 3 the 5-staple path 4 the 7-staple and path 5 the Lepage term. *
 * Path 1 is the Naik term.                                            */
 
 /* to fix up the Lepage term, included by a trick below */
 one_link = (p->act_path_coeff.one_link - 6.0*p->act_path_coeff.lepage);
 
 for (dir=XUP; dir<=TUP; dir++){
   FORALLFIELDSITES(i) /* Intialize fat links with c_1*U_\mu(x) */
     {
       fat1 = fat +  4*i + dir;
       scalar_mult_su3_matrix(links + 4*i + dir, one_link,
			      fat1 );
     }
   /* Skip the rest of the calculation if the remaining coefficients vanish */
   if( p->act_path_coeff.three_staple == 0.0 &&
       p->act_path_coeff.lepage == 0.0 &&
       p->act_path_coeff.five_staple == 0.0)continue;

   for(nu=XUP; nu<=TUP; nu++) if(nu!=dir)
     {
//       compute_gen_staple_site(staple,dir,nu,F_OFFSET(link[dir]),
//			       *t_fl, act_path_coeff.three_staple);

       compute_gen_staple_field(staple, dir, nu, links + dir, 4,
				fat, p->act_path_coeff.three_staple, links);
       /* The Lepage term */
       /* Note this also involves modifying c_1 (above) */
       compute_gen_staple_field(NULL, dir, nu, staple, 1,
				fat, p->act_path_coeff.lepage, links);
       for(rho=XUP; rho<=TUP; rho++) if((rho!=dir)&&(rho!=nu))
	 {
	   compute_gen_staple_field( tempmat1, dir, rho, staple, 1,
				     fat, p->act_path_coeff.five_staple, links);
	   for(sig=XUP; sig<=TUP; sig++)
	     if((sig!=dir)&&(sig!=nu)&&(sig!=rho))
	       {
		 compute_gen_staple_field(NULL,dir,sig,tempmat1, 1,
				  fat, p->act_path_coeff.seven_staple, links);
	       } /* sig */
	 } /* rho */
     } /* nu */
 }/* dir */  
#endif

 special_free(staple);  staple = NULL;
 special_free(tempmat1); tempmat1 = NULL;

 dtime += dclock();
 info->final_sec += dtime;
 info->final_flop = 61632.*volume/numnodes();
 if( p->act_path_coeff.three_staple == 0.0 &&
     p->act_path_coeff.lepage == 0.0 &&
     p->act_path_coeff.five_staple == 0.0)
   info->final_flop = 72.*volume/numnodes();

}  /* load_fatlinks_cpu */
示例#11
0
/* update the momenta with the gauge force */
void imp_gauge_force_cpu( Real eps, field_offset mom_off ){
    register int i,dir;
    register site *st;
    su3_matrix tmat1,tmat2;
    register Real eb3;
    register anti_hermitmat* momentum;
    su3_matrix *staple, *tempmat1;

    /* lengths of various kinds of loops */
    int *loop_length = get_loop_length();
    /* number of rotations/reflections  for each kind */
    int *loop_num = get_loop_num();
    /* table of directions, 1 for each kind of loop */
    int ***loop_table = get_loop_table();
    /* table of coefficients in action, for various "representations"
	(actually, powers of the trace) */
    Real **loop_coeff = get_loop_coeff();
    int max_length = get_max_length();
    int nloop = get_nloop();
    int nreps = get_nreps();

#ifdef GFTIME
    int nflop = 153004;  /* For Symanzik1 action */
    double dtime;
#endif
    int j,k;
    int *dirs,length;
    int *path_dir,path_length;

    int ln,iloop;
    Real action,act2,new_term;

    int ncount;
    char myname[] = "imp_gauge_force";

#ifdef GFTIME
    dtime=-dclock();
#endif

    dirs = (int *)malloc(max_length*sizeof(int));
    if(dirs == NULL){
      printf("%s(%d): Can't malloc dirs\n",myname,this_node);
      terminate(1);
    }
    path_dir = (int *)malloc(max_length*sizeof(int));
    if(path_dir == NULL){
      printf("%s(%d): Can't malloc path_dir\n",myname,this_node);
      terminate(1);
    }
    staple = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
    if(staple == NULL){
      printf("%s(%d): Can't malloc temporary\n",myname,this_node);
      terminate(1);
    }

    tempmat1 = (su3_matrix *)special_alloc(sites_on_node*sizeof(su3_matrix));
    if(tempmat1 == NULL){
      printf("%s(%d): Can't malloc temporary\n",myname,this_node);
      terminate(1);
    }

    eb3 = eps*beta/3.0;

    /* Loop over directions, update mom[dir] */
    for(dir=XUP; dir<=TUP; dir++){

	FORALLSITES(i,st)for(j=0;j<3;j++)for(k=0;k<3;k++){
			staple[i].e[j][k]=cmplx(0.0,0.0);
	} END_LOOP

	ncount=0;
	for(iloop=0;iloop<nloop;iloop++){
	    length=loop_length[iloop];
	    for(ln=0;ln<loop_num[iloop];ln++){
/**printf("UPD:  "); printpath( loop_table[iloop][ln], length );**/
		/* set up dirs.  we are looking at loop starting in "XUP"
		   direction, rotate so it starts in "dir" direction. */
		for(k=0;k<length;k++){
                    if( GOES_FORWARDS(loop_table[iloop][ln][k]) ){
                	dirs[k]=(dir+loop_table[iloop][ln][k] )% 4;
		    }
            	    else {
                        dirs[k]=OPP_DIR(
			    (dir+OPP_DIR(loop_table[iloop][ln][k]))%4 );
		    }
		}

		path_length= length-1;  /* generalized "staple" */

		/* check for links in direction of momentum to be
		   updated, each such link gives a contribution. Note
		   the direction of the path - opposite the link. */
		for(k=0;k<length;k++)if( dirs[k]==dir||dirs[k]==OPP_DIR(dir)) {
		    if( GOES_FORWARDS(dirs[k]) ) for(j=0;j<path_length;j++) {
			path_dir[j] = dirs[(k+j+1)%length];
		    }
		    if( GOES_BACKWARDS(dirs[k]) ) for(j=0;j<path_length;j++) {
			path_dir[path_length-1-j] =
			    OPP_DIR(dirs[(k+j+1)%length]);
		    }
/**if(dir==XUP)printf("X_UPDATE PATH: "); printpath( path_dir, path_length );**/
		    path_product(path_dir,path_length, tempmat1);

		    /* We took the path in the other direction from our
			old convention in order to get it to end up
			"at our site", so now take adjoint */
		    /* then compute "single_action" contribution to
			staple */
		    FORALLSITES(i,st){
			su3_adjoint( &(tempmat1[i]), &tmat1 );
			/* first we compute the fundamental term */
			new_term = loop_coeff[iloop][0];

			/* now we add in the higher representations */
			if(nreps > 1){
node0_printf("WARNING: THIS CODE IS NOT TESTED\n"); exit(0);
			    act2=1.0;
			    action = 3.0 - realtrace_su3(&(st->link[dir]),
				&tmat1 ); 

			    for(j=1;j<nreps;j++){
				act2 *= action;
				new_term +=
				    loop_coeff[iloop][j]*act2*(Real)(j+1);
			    }
			}  /* end if nreps > 1 */

			scalar_mult_add_su3_matrix( &(staple[i]), &tmat1,
				new_term, &(staple[i]) );

		    } END_LOOP

		    ncount++;

		} /* k (location in path) */
	    } /* ln */
	} /* iloop */

	/* Now multiply the staple sum by the link, then update momentum */
	FORALLSITES(i,st){
	    mult_su3_na( &(st->link[dir]), &(staple[i]), &tmat1 );
	    momentum = (anti_hermitmat *)F_PT(st,mom_off);
	    uncompress_anti_hermitian( &momentum[dir], &tmat2 );
	    scalar_mult_sub_su3_matrix( &tmat2, &tmat1,
		eb3, &(staple[i]) );
	    make_anti_hermitian( &(staple[i]), &momentum[dir] );
	} END_LOOP