//
// dslashReference()
//
// Reference dslash: for each of the Nh sites, accumulates into res the
// contributions of all 8 directions. Each contribution is the neighbouring
// spinor (from spinorNeighbor) run through multiplySpinorByDiracProjector
// and then multiplied, block by block, with the link from gaugeLink.
//
// res         : output, Nh*4*3*2 floats; cleared first, then accumulated into
// gaugeFull   : 4 link arrays, one per dimension; in each, the even-parity
//               links come first and the odd-parity links start
//               Nh*gaugeSiteSize floats later
// spinorField : input spinor field, read through spinorNeighbor()
//
// oddBit == 0    : calculate odd parity spinor elements (using even parity spinor)
// oddBit == 1    : calculate even parity spinor elements
//
// daggerBit == 0 : perform ordinary dslash operator
// daggerBit == 1 : perform hermitian conjugate of dslash
//
void dslashReference(float *res, float **gaugeFull, float *spinorField, int oddBit, int daggerBit) {
    const int siteLen  = 4*3*2;  // floats per site of res: 4 blocks of 3*2
    const int blockLen = 3*2;    // floats handled by one su3_mul / su3_Tmul call

    // Split every per-dimension link array into its two parity halves.
    float *evenLinks[4];
    float *oddLinks[4];
    for (int mu = 0; mu < 4; mu++) {
        evenLinks[mu] = gaugeFull[mu];
        oddLinks[mu]  = gaugeFull[mu] + Nh*gaugeSiteSize;
    }

    zero(res, Nh*siteLen);

    for (int site = 0; site < Nh; site++) {
        float *out = res + site*siteLen;

        for (int dir = 0; dir < 8; dir++) {
            float *link     = gaugeLink(site, dir, oddBit, evenLinks, oddLinks);
            float *neighbor = spinorNeighbor(site, dir, oddBit, spinorField);

            float projected[4*3*2];
            float transported[4*3*2];

            // Projector index stays inside the pair {2*(dir/2), 2*(dir/2)+1};
            // daggerBit flips which member of the pair is selected.
            const int projector = 2*(dir/2) + (dir + daggerBit)%2;
            multiplySpinorByDiracProjector(projected, projector, neighbor);

            // Even directions use su3_mul, odd directions su3_Tmul. The choice
            // does not depend on s, so it is made once per direction.
            if (dir % 2 == 0) {
                for (int s = 0; s < 4; s++) {
                    su3_mul(transported + s*blockLen, link, projected + s*blockLen);
                }
            } else {
                for (int s = 0; s < 4; s++) {
                    su3_Tmul(transported + s*blockLen, link, projected + s*blockLen);
                }
            }

            // out += transported
            sum(out, out, transported, siteLen);
        }
    }
}
//
// SmearJacobireference()
//
// Applies `steps` smearing iterations to spinorField and leaves the result in
// res. Going by the helper names (xeqay: x = a*y, xpeqay: x += a*y), one
// iteration computes for every site x of the full lattice (both parities)
//
//   out(x) = 1/(1+6r) * in(x)
//          + r/(1+6r) * sum over dir = 0..5 of  U(x, dir) * in(neighbor(x, dir))
//
// where U(x, dir) comes from gaugeLink() and the neighbouring spinor from
// spinorNeighborFullLattice(). Even directions are applied with su3Mul, odd
// directions with su3Tmul. Only directions 0..5 are visited (spatial smearing
// only); directions 6 and 7 are skipped.
//
// res         : output, V*spinorSiteSize elements. Also used as one of the two
//               ping-pong buffers, so its contents are overwritten during
//               the iteration.
// gaugeFull   : 4 link arrays, one per dimension; in each, the even-parity
//               links come first and the odd-parity links start
//               Vh*gaugeSiteSize elements later.
// spinorField : input, V*spinorSiteSize elements. It is read exactly once
//               (copied into a scratch buffer) before res is first written.
// r           : smearing weight; 1+6*r must be non-zero.
// steps       : number of iterations; if steps <= 0, res receives a plain copy
//               of spinorField.
//
// The scratch buffer is heap-allocated. The function returns void and so has
// no way to report failure; if the allocation fails the process is aborted
// instead of dereferencing a null pointer.
//
void SmearJacobireference(sFloat *res, gFloat **gaugeFull, sFloat *spinorField, double r, int steps) {
  // Widen to size_t before multiplying so the byte count is not formed in int.
  const size_t spinorBytes = (size_t)V*spinorSiteSize*sizeof(sFloat);

  sFloat *tmpSpinor = (sFloat *)malloc(spinorBytes);
  if (!tmpSpinor && spinorBytes != 0) {
    abort();
  }
  sFloat *spinorIn, *spinorOut, *tmp;

  // Work on a private copy of the input; res and tmpSpinor then alternate as
  // source and destination from one iteration to the next.
  xeqy(tmpSpinor, spinorField, V*spinorSiteSize);
  spinorIn = tmpSpinor;
  spinorOut = res;

  gFloat *gaugeEven[4], *gaugeOdd[4];
  for (int dir = 0; dir < 4; dir++) {
    gaugeEven[dir] = gaugeFull[dir];
    gaugeOdd[dir]  = gaugeFull[dir]+Vh*gaugeSiteSize;
  }

  // Both weights depend only on r, so compute them once.
  const double selfWeight = 1/(1+6*r);
  const double hopWeight  = r/(1+6*r);

  for (int iter = 0; iter < steps; iter++) {

    // Local term: spinorOut = selfWeight * spinorIn
    xeqay(spinorOut, selfWeight, spinorIn, V*spinorSiteSize);

    for (int oddBit = 0; oddBit < 2; oddBit++) {
      for (int i = 0; i < Vh; i++) {
        // Sites of parity oddBit occupy [oddBit*Vh, oddBit*Vh + Vh) in the
        // full-lattice spinor arrays.
        int fullindex = oddBit*Vh + i;

        // Spatial smearing only
        for (int dir = 0; dir < 6; dir++) {
          gFloat *gauge = gaugeLink(i, dir, oddBit, gaugeEven, gaugeOdd, 1);
          sFloat *spinor = spinorNeighborFullLattice(fullindex, dir, spinorIn, 1);
          sFloat gaugedSpinor[4*3*2];

          for (int s = 0; s < 4; s++) {
            if (dir % 2 == 0) {
              su3Mul(&gaugedSpinor[s*(3*2)], gauge, &spinor[s*(3*2)]);
            } else {
              su3Tmul(&gaugedSpinor[s*(3*2)], gauge, &spinor[s*(3*2)]);
            }
          }

          // Accumulate result from gaugedSpinor into spinorOut
          xpeqay(&spinorOut[fullindex*(4*3*2)], hopWeight, gaugedSpinor, 4*3*2);
        }
      }
    }

    // Swap the pointers: this iteration's output is the next one's input.
    tmp = spinorIn;
    spinorIn = spinorOut;
    spinorOut = tmp;
  }

  // After the final swap spinorIn holds the latest result. It is res itself
  // after an odd number of iterations; otherwise copy it over.
  if (spinorIn != res) {
    xeqy(res, spinorIn, V*spinorSiteSize);
  }
  free(tmpSpinor);
}