Exemplo n.º 1
0
/*
 * Hopping_Matrix -- variant written with the _sse_* register macros and the
 * half-spinor buffers addressed through NBPointer.
 *
 *   ieo  selects the data sets used: the first loop reads gauge links from
 *        g_gauge_field_copy[0][0] (ieo == 0) or g_gauge_field_copy[1][0]
 *        (otherwise) and writes through NBPointer[ieo]; the second loop uses
 *        the other gauge copy and reads through NBPointer[2 + ieo].
 *   l    output spinor field; VOLUME/2 spinors are written in the second loop.
 *   k    input spinor field; VOLUME/2 spinors are read in the first loop.
 *
 * Structure:
 *   1. For every input spinor, eight half-spinors (directions +0,-0,...,+3,-3)
 *      are formed from pairs of its components and stored through phi[ix].
 *      For the "+" directions the pair is additionally passed through
 *      _sse_su3_multiply(*U) and _sse_vector_cmplx_mul(ka<mu>) before storing.
 *   2. xchange_halffield() is called when MPI is defined and _NO_COMM is not.
 *   3. For every output spinor, eight half-spinors are read through phi[ix]
 *      and accumulated into the scratch spinor rs; for the "-" directions the
 *      half-spinor first goes through _sse_su3_inverse_multiply(*U) and
 *      _sse_vector_cmplxcg_mul(ka<mu>).  The last direction (-3) writes the
 *      finished components to *s instead of back to rs.
 *
 * NOTE(review): the _sse_* macros are defined elsewhere; they presumably
 * operate on implicit register state, so the statement order below must not
 * be changed without checking their definitions.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix, i;
  su3 * restrict U ALIGN;
  /* Scratch accumulator for the second loop.  It is static, so its storage */
  /* is shared by all calls (NOTE(review): makes the routine non-reentrant). */
  static spinor rs;
  spinor * restrict s ALIGN;
  halfspinor ** phi ALIGN;
  /* predist: offset (in su3 elements) used for the _prefetch_su3(U+predist) */
  /* calls below; 2 when OPTERON is defined, 1 otherwise.                    */
#if defined OPTERON
  const int predist=2;
#else
  const int predist=1;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif

#ifdef _GAUGE_COPY
  /* refresh the gauge copy if it has been flagged as outdated */
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  /* We will run through the source vector now */
  /* instead of the solution vector            */
  s = k;
  _prefetch_spinor(s);

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  phi = NBPointer[ieo];

  _prefetch_su3(U);
  /**************** loop over all lattice sites ******************/
  /* ix indexes phi and advances by 8 per site (one entry per direction); */
  /* U advances by 4 per site (once per "+" direction).                   */
  ix=0;
  for(i = 0; i < (VOLUME)/2; i++){

    /*********************** direction +0 ************************/
    _prefetch_su3(U+predist);

    /* components (s0,s2) -> phi[ix].s0 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s0);

    /* components (s1,s3) -> phi[ix].s1 */
    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_add();
      
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s1);
    U++;
    ix++;
    /*********************** direction -0 ************************/
    /* no gauge link applied here; only the component difference is stored */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +1 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    /*next not needed?*/
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -1 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +2 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -2 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +3 ************************/
    _prefetch_su3(U+predist);
    /* start fetching the next source spinor before this one is finished */
    _prefetch_spinor(s+1);

    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -3 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    s++;
  }

#    if (defined MPI && !defined _NO_COMM)
  xchange_halffield(); 
#    endif
  /* second pass: s now walks the output field, and the gauge copy is the */
  /* opposite one from the first pass                                     */
  s = l;
  phi = NBPointer[2 + ieo];
  if(ieo == 0) {
    U = g_gauge_field_copy[1][0];
  }
  else {
    U = g_gauge_field_copy[0][0];
  }
  _prefetch_su3(U);
  
  /* Now we sum up and expand to a full spinor */
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    /* initialise the accumulator: s0 and s2 from phi.s0, s1 and s3 from phi.s1 */
    _vector_assign(rs.s0, (*phi[ix]).s0);
    _vector_assign(rs.s2, (*phi[ix]).s0);
    _vector_assign(rs.s1, (*phi[ix]).s1);
    _vector_assign(rs.s3, (*phi[ix]).s1);
    ix++;

    /*********************** direction -0 ************************/
    _prefetch_su3(U+predist);
      
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    ix++;
    U++;
    /*********************** direction +1 ************************/
    /* "+" directions in this pass use the stored half-spinor directly, */
    /* without a gauge-link multiplication                              */
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s3); 

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);       
    ix++;

    /*********************** direction -1 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;

    /*********************** direction +2 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);      
    ix++;

    /*********************** direction -2 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);      
    ix++;
    U++;
    /*********************** direction +3 ************************/
    _sse_load_up((*phi[ix]).s0);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load_up((*phi[ix]).s1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    ix++;
    /*********************** direction -3 ************************/

    _prefetch_su3(U+predist); 
    _prefetch_spinor(s+1);

    _sse_load((*phi[ix]).s0);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    /* last contribution: results go to the output spinor *s via the */
    /* _sse_store_nt macro instead of back into rs                   */
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt((*s).s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store_nt((*s).s2);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt((*s).s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store_nt((*s).s3);
    ix++;
    U++;
    s++;
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
Exemplo n.º 2
0
/*
 * Hopping_Matrix -- variant built from the _hop_<dir>_<p|m>_<pre|post>[32]
 * macros.
 *
 *   ieo  selects the data sets used: the first loop of each branch takes its
 *        gauge links from g_gauge_field_copy[0][0] (ieo == 0) or
 *        g_gauge_field_copy[1][0] (otherwise) and the pointer table
 *        NBPointer[ieo] / NBPointer32[ieo]; the second loop uses the other
 *        gauge copy and NBPointer[2 + ieo] / NBPointer32[2 + ieo].
 *   l    spinor field assigned to s for the second ("post") loop.
 *   k    spinor field assigned to s for the first ("pre") loop.
 *
 * Two code paths with identical structure exist:
 *   - g_sloppy_precision == 1 && g_sloppy_precision_flag == 1:
 *     the "...32" macros, phi32 / NBPointer32 and xchange_halffield32();
 *   - otherwise: the plain macros, phi / NBPointer and xchange_halffield().
 * Each path runs a "pre" loop over VOLUME/2 sites, an optional halo exchange
 * (TM_USE_MPI defined and _NO_COMM not defined), then a "post" loop over
 * VOLUME/2 sites.
 *
 * NOTE(review): the _hop_* macros are defined elsewhere.  They presumably
 * read and write s, U, ix, phi/phi32 and the registers set up by
 * _declare_hregs() implicitly, none of which appear as macro arguments here;
 * confirm against the macro definitions before reordering any statement.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  /* refresh the gauge copy if it has been flagged as outdated */
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  /* alignment assertions (16 bytes) for the compiler on both spinor fields */
  __alignx(16, l);
  __alignx(16, k);
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    /* ---------- sloppy-precision path: halfspinor32 buffers ---------- */
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    /* per site: ix advances 8 times, U 4 times, s once */
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre32();
      /* NOTE(review): s is advanced here, before the remaining seven   */
      /* directions; presumably the first macro has already loaded the  */
      /* current spinor -- confirm against the macro definitions        */
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre32();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre32();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre32();
      ix++;

      /*********************** direction +2 ************************/
      _hop_y_p_pre32();

      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre32();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre32();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre32();
      ix++;

      /************************ end of loop ************************/
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    /* second pass: s walks l, with the opposite gauge copy */
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    //_prefetch_halfspinor(phi32[0]);
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi32[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post32();
      ix++;
      /*********************** direction -0 ************************/
      /* in this pass U is advanced after each "-" direction */
      _hop_t_m_post32();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post32();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post32();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post32();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post32();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post32();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    /* ---------- regular path: halfspinor buffers ---------- */
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    /* per site: ix advances 8 times, U 4 times, s once */
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre();
      /* NOTE(review): as in the sloppy path, s is advanced right after */
      /* the first direction -- confirm against the macro definitions   */
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre();
      ix++;


      /*********************** direction +2 ************************/
      _hop_y_p_pre();
      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre();
      ix++;

      /************************ end of loop ************************/

    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    /* second pass: s walks l, with the opposite gauge copy */
    s = l;
    phi = NBPointer[2 + ieo];
    //_prefetch_halfspinor(phi[0]);
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post();
      ix++;
      /*********************** direction -0 ************************/
      /* in this pass U is advanced after each "-" direction */
      _hop_t_m_post();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
Exemplo n.º 3
0
/*
 * Hopping_Matrix -- plain-C (macro based) variant using the half-spinor
 * buffers addressed through NBPointer / NBPointer32.
 *
 * For ieo=0, k resides on odd sites and l on even sites.
 *
 *   ieo  selects the data sets used: the first loop takes its gauge links
 *        from g_gauge_field_copy[0][0] (ieo == 0) or g_gauge_field_copy[1][0]
 *        (otherwise) and writes through NBPointer[ieo] / NBPointer32[ieo];
 *        the second loop uses the other gauge copy and reads through
 *        NBPointer[2 + ieo] / NBPointer32[2 + ieo].
 *   l    output spinor field; VOLUME/2 spinors are written in the second loop.
 *   k    input spinor field; VOLUME/2 spinors are read in the first loop.
 *        k must not be the same pointer as l: in that case a diagnostic is
 *        written to stderr and the process terminates with exit status 1.
 *
 * Two code paths with the same arithmetic exist:
 *   - g_sloppy_precision == 1 && g_sloppy_precision_flag == 1:
 *     phi32 / NBPointer32 and xchange_halffield32();
 *   - otherwise: phi / NBPointer and xchange_halffield().
 *
 * Each path does:
 *   1. For every input spinor, build eight half-spinors (directions
 *      +0,-0,...,+3,-3).  "+" directions are multiplied with the link *U
 *      (_su3_multiply) and the factor ka<mu> (_complex_times_vector); "-"
 *      directions store the plain component combination.
 *   2. Exchange the half-spinor field when MPI is defined and _NO_COMM is not.
 *   3. For every output spinor, accumulate the eight half-spinors into rs.
 *      "-" directions are first passed through _su3_inverse_multiply(*U) and
 *      _complexcjg_times_vector(ka<mu>).  The final direction (-3) writes the
 *      finished components straight into *s.
 *
 * Per site ix advances by 8 and U by 4 in both loops.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int i,ix;
  su3 * restrict U ALIGN;
  spinor * restrict s ALIGN;
  spinor rs;
  /* Scratch colour vectors.  They are static, so their storage is shared by */
  /* all calls (NOTE(review): this makes the routine non-reentrant).  Every  */
  /* use below assigns them before reading them.                             */
  static su3_vector psi, chi, psi2, chi2;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#ifdef XLC
#pragma disjoint(*l, *k, *U, *s)
#endif

#ifdef _GAUGE_COPY
  /* refresh the gauge copy if it has been flagged as outdated */
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif

  /* Input and output must be distinct fields.  The diagnostic goes to      */
  /* stderr so it is not lost in (or mixed into) regular program output.    */
  if(k == l){
    fprintf(stderr, "Error in Hopping_Matrix:\n");
    fprintf(stderr, "Arguments k and l must be different\n");
    fprintf(stderr, "Program aborted\n");
    exit(1);
  }
  s = k;

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    /* ---------- sloppy-precision path: halfspinor32 buffers ---------- */
    phi32 = NBPointer32[ieo];

    /**************** loop over all lattice sites ****************/
    ix=0;
    for(i = 0; i < (VOLUME)/2; i++){
      /* take a local copy of the source spinor, then move on to the next */
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/

      _vector_add(psi, rs.s0, rs.s2);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka0, chi);

      _vector_add(psi, rs.s1, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka0, chi);

      U++;
      ix++;

      /*********************** direction -0 ************************/

      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi32[ix]).s1, rs.s1, rs.s3);

      ix++;

      /*********************** direction +1 ************************/

      _vector_i_add(psi, rs.s0, rs.s3);

      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka1, chi);

      _vector_i_add(psi, rs.s1, rs.s2);

      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s1, ka1, chi);

      U++;
      ix++;

      /*********************** direction -1 ************************/

      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi32[ix]).s1, rs.s1, rs.s2);

      ix++;
      /*********************** direction +2 ************************/

      _vector_add(psi, rs.s0, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s0, ka2, chi);

      _vector_sub(psi, rs.s1, rs.s2);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka2, chi);

      U++;
      ix++;

      /*********************** direction -2 ************************/

      _vector_sub((*phi32[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi32[ix]).s1, rs.s1, rs.s2);
      ix++;

      /*********************** direction +3 ************************/

      _vector_i_add(psi, rs.s0, rs.s2);

      _su3_multiply(chi, (*U), psi);
      _complex_times_vector((*phi32[ix]).s0, ka3, chi);


      _vector_i_sub(psi, rs.s1, rs.s3);

      _su3_multiply(chi,(*U),psi);
      _complex_times_vector((*phi32[ix]).s1, ka3, chi);

      U++;
      ix++;
      /*********************** direction -3 ************************/

      _vector_i_sub((*phi32[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi32[ix]).s1, rs.s1, rs.s3);

      ix++;
      /************************ end of loop ************************/
    }
#    if (defined MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    /* second pass: s walks the output field, with the opposite gauge copy */
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }

    ix = 0;
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      /* initialise the accumulator from the first half-spinor */
      _vector_assign(rs.s0, (*phi32[ix]).s0);
      _vector_assign(rs.s2, (*phi32[ix]).s0);
      _vector_assign(rs.s1, (*phi32[ix]).s1);
      _vector_assign(rs.s3, (*phi32[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      /* the 32-bit half-spinor is copied into psi before the link multiply */
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka0,chi);

      _vector_add_assign(rs.s1, psi);
      _vector_sub_assign(rs.s3, psi);
      ix++;
      U++;
      /*********************** direction +1 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s1);

      ix++;
      /*********************** direction -1 ************************/
      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka1,chi);

      _vector_add_assign(rs.s1, psi);
      _vector_i_add_assign(rs.s2, psi);

      U++;
      ix++;

      /*********************** direction +2 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_add_assign(rs.s3, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi32[ix]).s1);

      ix++;
      /*********************** direction -2 ************************/

      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);

      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi, (*U), psi);
      _complexcjg_times_vector(psi,ka2,chi);

      _vector_add_assign(rs.s1, psi);
      _vector_add_assign(rs.s2, psi);

      U++;
      ix++;
      /*********************** direction +3 ************************/

      _vector_add_assign(rs.s0, (*phi32[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi32[ix]).s0);

      _vector_add_assign(rs.s1, (*phi32[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi32[ix]).s1);

      ix++;

      /*********************** direction -3 ************************/

      _vector_assign(psi, (*phi32[ix]).s0);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);

      /* last contribution: write the finished components to the output */
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);

      _vector_assign(psi, (*phi32[ix]).s1);
      _su3_inverse_multiply(chi,(*U), psi);
      _complexcjg_times_vector(psi,ka3,chi);

      _vector_add((*s).s1, rs.s1, psi);
      _vector_i_sub((*s).s3, rs.s3, psi);

      U++;
      ix++;
      s++;
    }
  }
  else {
    /* ---------- regular path: halfspinor buffers ---------- */
    phi = NBPointer[ieo];

    /**************** loop over all lattice sites ****************/
    ix=0;
    /* #pragma ivdep*/
    for(i = 0; i < (VOLUME)/2; i++){
      /* take a local copy of the source spinor, then move on to the next */
      _vector_assign(rs.s0, (*s).s0);
      _vector_assign(rs.s1, (*s).s1);
      _vector_assign(rs.s2, (*s).s2);
      _vector_assign(rs.s3, (*s).s3);
      s++;
      /*********************** direction +0 ************************/

      _vector_add(psi, rs.s0, rs.s2);
      _vector_add(psi2, rs.s1, rs.s3);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka0, chi);
      _complex_times_vector((*phi[ix]).s1, ka0, chi2);

      U++;
      ix++;

      /*********************** direction -0 ************************/

      _vector_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_sub((*phi[ix]).s1, rs.s1, rs.s3);

      ix++;

      /*********************** direction +1 ************************/

      _vector_i_add(psi, rs.s0, rs.s3);
      _vector_i_add(psi2, rs.s1, rs.s2);
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2, (*U), psi2);
      _complex_times_vector((*phi[ix]).s0, ka1, chi);
      _complex_times_vector((*phi[ix]).s1, ka1, chi2);

      U++;
      ix++;

      /*********************** direction -1 ************************/

      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_i_sub((*phi[ix]).s1, rs.s1, rs.s2);

      ix++;
      /*********************** direction +2 ************************/

      _vector_add(psi, rs.s0, rs.s3);
      _vector_sub(psi2, rs.s1, rs.s2);
      _su3_multiply(chi,(*U),psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka2, chi);
      _complex_times_vector((*phi[ix]).s1, ka2, chi2);

      U++;
      ix++;

      /*********************** direction -2 ************************/

      _vector_sub((*phi[ix]).s0, rs.s0, rs.s3);
      _vector_add((*phi[ix]).s1, rs.s1, rs.s2);
      ix++;

      /*********************** direction +3 ************************/

      _vector_i_add(psi, rs.s0, rs.s2);
      _vector_i_sub(psi2, rs.s1, rs.s3);
      _su3_multiply(chi, (*U), psi);
      _su3_multiply(chi2,(*U),psi2);
      _complex_times_vector((*phi[ix]).s0, ka3, chi);
      _complex_times_vector((*phi[ix]).s1, ka3, chi2);

      U++;
      ix++;
      /*********************** direction -3 ************************/

      _vector_i_sub((*phi[ix]).s0, rs.s0, rs.s2);
      _vector_i_add((*phi[ix]).s1, rs.s1, rs.s3);

      ix++;
      /************************ end of loop ************************/
    }
#    if (defined MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    /* second pass: s walks the output field, with the opposite gauge copy */
    s = l;
    phi = NBPointer[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }

    ix = 0;
    /* #pragma ivdep */
    for(i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      /* initialise the accumulator from the first half-spinor */
      _vector_assign(rs.s0, (*phi[ix]).s0);
      _vector_assign(rs.s2, (*phi[ix]).s0);
      _vector_assign(rs.s1, (*phi[ix]).s1);
      _vector_assign(rs.s3, (*phi[ix]).s1);
      ix++;
      /*********************** direction -0 ************************/
      _su3_inverse_multiply(chi,(*U),(*phi[ix]).s0);
      _su3_inverse_multiply(chi2,(*U),(*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka0,chi);
      _complexcjg_times_vector(psi2,ka0,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s2, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_sub_assign(rs.s3, psi2);
      ix++;
      U++;
      /*********************** direction +1 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s3, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s1);

      ix++;
      /*********************** direction -1 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka1,chi);
      _complexcjg_times_vector(psi2,ka1,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_i_add_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_i_add_assign(rs.s2, psi2);

      U++;
      ix++;

      /*********************** direction +2 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_add_assign(rs.s3, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_sub_assign(rs.s2, (*phi[ix]).s1);

      ix++;
      /*********************** direction -2 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka2,chi);
      _complexcjg_times_vector(psi2,ka2,chi2);
      _vector_add_assign(rs.s0, psi);
      _vector_sub_assign(rs.s3, psi);
      _vector_add_assign(rs.s1, psi2);
      _vector_add_assign(rs.s2, psi2);

      U++;
      ix++;
      /*********************** direction +3 ************************/

      _vector_add_assign(rs.s0, (*phi[ix]).s0);
      _vector_i_sub_assign(rs.s2, (*phi[ix]).s0);

      _vector_add_assign(rs.s1, (*phi[ix]).s1);
      _vector_i_add_assign(rs.s3, (*phi[ix]).s1);

      ix++;

      /*********************** direction -3 ************************/

      _su3_inverse_multiply(chi,(*U), (*phi[ix]).s0);
      _su3_inverse_multiply(chi2, (*U), (*phi[ix]).s1);
      _complexcjg_times_vector(psi,ka3,chi);
      _complexcjg_times_vector(psi2,ka3,chi2);
      /* last contribution: write the finished components to the output */
      _vector_add((*s).s0, rs.s0, psi);
      _vector_i_add((*s).s2, rs.s2, psi);
      _vector_add((*s).s1, rs.s1, psi2);
      _vector_i_sub((*s).s3, rs.s3, psi2);

      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}