/* deriv_Sb_D_psi
 *
 * Accumulates into hf->derivative the gauge-field derivative of the hopping
 * part of the Wilson-Dirac operator, one term per direction +-0..+-3:
 * for each link the outer product of the gamma-projected left spinor l and
 * the neighbouring right spinor k is multiplied with the link matrix,
 * scaled by the direction's hopping parameter (ka0..ka3) and 2*factor,
 * and its traceless antihermitian part is added to the derivative field
 * (Blue Gene "bgl" intrinsic-macro version; macro semantics assumed from
 * their names -- confirm against bgl.h).
 *
 * l, k      : even/odd half spinor fields ((VOLUME+RAND)/2 sites each)
 * hf        : gauge field and derivative accumulator
 * factor    : overall scale, applied as 2.*factor in the trace-accumulate
 *
 * NOTE(review): ieo is read from file scope, not a parameter here.
 *
 * Fixes applied relative to the previous revision:
 *  - `icy=[iy];` syntax error -> `icy = iy;`
 *  - loop bound VOLUME+ioff -> VOLUME/2+ioff: the loop walks the VOLUME/2
 *    body sites of one parity; the old bound overran r = l + (icx-ioff)
 *  - directions -1, -2, -3 now load from sm (down neighbour) as -0 already
 *    did; each minus section prefetches sm, the old code then read sp
 *  - direction -2 now uses ka2 (was ka1; +2 uses ka2)
 */
void deriv_Sb_D_psi(spinor * const l, spinor * const k,
                    hamiltonian_field_t * const hf, const double factor) {
  int ix, iy, iz;
  int ioff, ioff2, icx, icy, icz;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  su3adj * restrict ddd;
  static su3adj der;
  static su3 v1, v2;
  static su3_vector psia, psib, phia, phib;
  static spinor rr;
  spinor * restrict r ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;
  /* We have 32 registers available; the _bgl_* macros operate on these. */
  double _Complex reg00, reg01, reg02, reg03, reg04, reg05;
  double _Complex reg10, reg11, reg12, reg13, reg14, reg15;
  /* For su3 matrix, use reg00 for missing register */
  double _Complex v00, v01, v02, v10, v11, v12, v20, v21;
  /* The following contains the left spinor (12 regs) and the final */
  /* su3 matrix to trace over */
  double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22,
                  r30, r31, r32;
#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd)

  __alignx(16, l);
  __alignx(16, k);

  if(ieo == 0) {
    ioff = 0;
  }
  else {
    ioff = (VOLUME+RAND)/2;
  }
  ioff2 = (VOLUME+RAND)/2 - ioff;

  /* exchange boundaries of both parities for parallelization */
#ifdef MPI
  xchange_field(k, ieo);
  xchange_field(l, (ieo+1)%2);
#endif

  /************** loop over all lattice sites ****************/
  /* prime the prefetch pipeline for the first site */
  ix = ioff;
  iy = g_iup[ix][0];
  icy = iy;
  sp = k + icy;
  _prefetch_spinor(sp);
  up = &hf->gaugefield[ix][0];
  _prefetch_su3(up);

  for(icx = ioff; icx < (VOLUME/2 + ioff); icx++) {
    /* load left vector r and multiply with gamma5 (minus on s2, s3) */
    r = l + (icx - ioff);
    ix = icx;

    /*********************** direction +0 ********************/
    ddd = &hf->derivative[ix][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    /* (1 + gamma0) projection of the up neighbour */
    _bgl_add_to_reg0_reg1();
    _bgl_add_to_reg0_up_reg1_up();
    _bgl_add_r0_to_r2_reg1();
    _bgl_add_r1_to_r3_reg1_up();
    /* prefetch the -0 neighbour while the tensor product runs */
    iy = g_idn[ix][0];
    icy = iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um = &hf->gaugefield[iy][0];
    _prefetch_su3(um);
    _bgl_tensor_product_and_add();            /* result in v now */
    _bgl_su3_times_v_dagger(*up);             /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /************** direction -0 ****************************/
    ddd = &hf->derivative[iy][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    /* (1 - gamma0) projection of the down neighbour */
    _bgl_sub_from_reg0_reg1();
    _bgl_sub_from_reg0_up_reg1_up();
    _bgl_sub_from_r0_r2_reg1();
    _bgl_sub_from_r1_r3_reg1_up();
    iy = g_iup[ix][1];
    icy = iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up = &hf->gaugefield[ix][1];
    _prefetch_su3(up);
    _bgl_tensor_product_and_add_d();          /* result in v now */
    _bgl_su3_times_v_dagger(*um);             /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +1 **************************/
    ddd = &hf->derivative[ix][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_i_mul_add_to_reg0_reg1_up();
    _bgl_i_mul_add_to_reg0_up_reg1();
    _bgl_i_mul_add_r0_to_r3_reg1();
    _bgl_i_mul_add_r1_to_r2_reg1_up();
    iy = g_idn[ix][1];
    icy = iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um = &hf->gaugefield[iy][1];
    _prefetch_su3(um);
    _bgl_tensor_product_and_add();            /* result in v now */
    _bgl_su3_times_v_dagger(*up);             /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /**************** direction -1 *************************/
    ddd = &hf->derivative[iy][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    /* down neighbour sm, prefetched in the +1 section */
    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_i_mul_sub_from_reg0_reg1_up();
    _bgl_i_mul_sub_from_reg0_up_reg1();
    _bgl_i_mul_sub_from_r0_r3_reg1();
    _bgl_i_mul_sub_from_r1_r2_reg1_up();
    iy = g_iup[ix][2];
    icy = iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up = &hf->gaugefield[ix][2];
    _prefetch_su3(up);
    _bgl_tensor_product_and_add_d();          /* result in v now */
    _bgl_su3_times_v_dagger(*um);             /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +2 **************************/
    ddd = &hf->derivative[ix][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_add_to_reg0_reg1_up();
    _bgl_sub_from_reg0_up_reg1();
    _bgl_add_r0_to_r3_reg1();
    _bgl_sub_from_r1_r2_reg1_up();
    iy = g_idn[ix][2];
    icy = iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um = &hf->gaugefield[iy][2];
    _prefetch_su3(um);
    _bgl_tensor_product_and_add();            /* result in v now */
    _bgl_su3_times_v_dagger(*up);             /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -2 ************************/
    ddd = &hf->derivative[iy][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    /* down neighbour sm, prefetched in the +2 section */
    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_sub_from_reg0_reg1_up();
    _bgl_add_to_reg0_up_reg1();
    _bgl_sub_from_r0_r3_reg1();
    _bgl_add_r1_to_r2_reg1_up();
    iy = g_iup[ix][3];
    icy = iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up = &hf->gaugefield[ix][3];
    _prefetch_su3(up);
    _bgl_tensor_product_and_add_d();          /* result in v now */
    _bgl_su3_times_v_dagger(*um);             /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** direction +3 ***********************/
    ddd = &hf->derivative[ix][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    _bgl_i_mul_add_to_reg0_reg1();
    _bgl_i_mul_sub_from_reg0_up_reg1_up();
    _bgl_i_mul_add_r0_to_r2_reg1();
    _bgl_i_mul_sub_from_r1_r3_reg1_up();
    iy = g_idn[ix][3];
    icy = iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um = &hf->gaugefield[iy][3];
    _prefetch_su3(um);
    _bgl_tensor_product_and_add();            /* result in v now */
    _bgl_su3_times_v_dagger(*up);             /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -3 ************************/
    ddd = &hf->derivative[iy][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);
    /* down neighbour sm, prefetched in the +3 section */
    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    _bgl_i_mul_sub_from_reg0_reg1();
    _bgl_i_mul_add_to_reg0_up_reg1_up();
    _bgl_i_mul_sub_from_r0_r2_reg1();
    _bgl_i_mul_add_r1_to_r3_reg1_up();
    /* prefetch the first neighbour of the NEXT site (wrap at loop end) */
    icz = icx + 1;
    if(icz == ((VOLUME+RAND)/2 + ioff)) icz = ioff;
    iz = icz;
    iy = g_iup[iz][0];
    icy = iy;
    sp = k + icy;
    _prefetch_spinor(sp);
    up = &hf->gaugefield[iz][0];
    _prefetch_su3(up);
    _bgl_tensor_product_and_add_d();          /* result in v now */
    _bgl_su3_times_v_dagger(*um);             /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);
    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
/* Serially Checked ! */
/* Dtm_psi
 *
 * Applies the full twisted-mass Wilson-Dirac operator to the lexicographic
 * spinor field Q and stores the result in P:
 * the hopping term in all eight directions (+-0..+-3, weighted with the
 * boundary phases phase_0..phase_3) plus the diagonal twisted term
 * (1 + i mu gamma5), implemented via fact1 = 1 + i*g_mu on the upper and
 * fact2 = conj(fact1) on the lower spinor components (SSE macro version).
 *
 * P, Q must be distinct fields of at least VOLUME sites; the routine
 * aborts if P == Q because rs is accumulated while Q is still being read.
 *
 * Fix relative to the previous revision: corrected the typo "differen"
 * in the abort message.
 */
void Dtm_psi(spinor * const P, spinor * const Q){

  if(P == Q){
    printf("Error in Dtm_psi (D_psi.c):\n");
    printf("Arguments must be different spinor fields\n");
    printf("Program aborted\n");
    exit(1);
  }

#ifdef _GAUGE_COPY2
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

# if defined TM_USE_MPI
  xchange_lexicfield(Q);
# endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
    int ix, iy, iz;
    su3 *up, *um;
    spinor *s, *sp, *sm, *rn;
    _Complex double fact1, fact2;
    spinor rs __attribute__ ((aligned (16)));

    /* diagonal twisted-mass factors: fact1 on s0/s1, fact2 on s2/s3 */
    fact1 = 1. + g_mu * I;
    fact2 = conj(fact1);

#ifndef TM_USE_OMP
    /* serial build: prime sp/up once; the -3 section advances them */
    iy = g_iup[0][0];
    sp = (spinor *) Q + iy;
    up = &g_gauge_field[0][0];
#endif

    /************************ loop over all lattice sites *************************/
#ifdef TM_USE_OMP
#pragma omp for
#endif
    for (ix = 0; ix < VOLUME; ix++){
#ifdef TM_USE_OMP
      /* with OMP each iteration recomputes its own pointers */
      iy = g_iup[ix][0];
      up = &g_gauge_field[ix][0];
      sp = (spinor *) Q + iy;
#endif
      s = (spinor *) Q + ix;
      _prefetch_spinor(s);

      /******************************* direction +0 *********************************/
      iy = g_idn[ix][0];
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      /* (1 + gamma0) projection: s0 + s2 */
      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s2);
      // the diagonal bit
      _sse_load_up(s->s0);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s0);
      // g5 in the twisted term
      _sse_load_up(s->s2);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);

      um = &g_gauge_field[iy][0];
      _prefetch_su3(um);

      /* same for the s1/s3 pair */
      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s3);
      // the diagonal bit
      _sse_load_up(s->s1);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s1);
      // g5 in the twisted term
      _sse_load_up(s->s3);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3);

      /******************************* direction -0 *********************************/
      iy = g_iup[ix][1];
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      /* (1 - gamma0): s0 - s2, multiplied with U^dagger and conj phase */
      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);

      up += 1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);

      /******************************* direction +1 *********************************/
      iy = g_idn[ix][1];
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      /* (1 + gamma1): s0 + i s3 */
      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_store(rs.s3);

      um = &g_gauge_field[iy][1];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_store(rs.s2);

      /******************************* direction -1 *********************************/
      iy = g_iup[ix][2];
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      /* (1 - gamma1): s0 - i s3 */
      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_store(rs.s3);

      up += 1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s2);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_store(rs.s2);

      /******************************* direction +2 *********************************/
      iy = g_idn[ix][2];
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      /* (1 + gamma2): s0 + s3, s1 - s2 */
      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3);

      um = &g_gauge_field[iy][2];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_sub();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);

      /******************************* direction -2 *********************************/
      iy = g_iup[ix][3];
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      /* (1 - gamma2): s0 - s3, s1 + s2 */
      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);

      up += 1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_add();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);

      /******************************* direction +3 *********************************/
      iy = g_idn[ix][3];
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      /* (1 + gamma3): s0 + i s2, s1 - i s3 */
      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);
      _sse_load(rs.s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_store(rs.s2);

      um = &g_gauge_field[iy][3];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);
      _sse_load(rs.s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_store(rs.s3);

      /******************************* direction -3 *********************************/
      /* prefetch for the next site (wraps at the volume boundary) */
      iz = (ix + 1 + VOLUME) % VOLUME;
      iy = g_iup[iz][0];
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      /* (1 - gamma3): s0 - i s2, s1 + i s3; result goes straight to P */
      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      rn = (spinor *) P + ix;
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store_nt(rn->s0);
      _sse_load(rs.s2);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_store_nt(rn->s2);

      up = &g_gauge_field[iz][0];
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store_nt(rn->s1);
      _sse_load(rs.s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      _sse_store_nt(rn->s3);
      /******************************** end of loop *********************************/
    }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
}
/* Hopping_Matrix (halfspinor version)
 *
 * Applies the even/odd hopping matrix: l = H_{eo/oe} k, using the
 * half-spinor/indirect-addressing scheme.  Two sweeps over VOLUME/2 sites:
 * a "pre" sweep projecting the source k onto half spinors stored via the
 * NBPointer tables, a boundary exchange, then a "post" sweep summing the
 * eight contributions back into full spinors in l.  When both sloppy
 * precision flags are set, the 32-bit half-spinor variant (_*32 macros,
 * NBPointer32) is used instead.
 *
 * ieo : parity of the output field l (source k has opposite parity)
 *
 * NOTE(review): the _hop_*_pre macros appear to consume the spinor loaded
 * at s and to step phi/U through ix -- s is advanced right after the +t
 * hop, so the remaining direction macros presumably reuse registers loaded
 * by _hop_t_p_pre; confirm against the macro definitions.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif
  __alignx(16, l);
  __alignx(16, k);

  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    /* 32-bit (sloppy precision) branch */
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector */
    s = k;
    _prefetch_spinor(s);

    /* s contains the source vector */
    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix = 0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre32();
      s++;
      U++;
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_pre32();
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_pre32();
      ix++;
      U++;
      /*********************** direction -1 ************************/
      _hop_x_m_pre32();
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_pre32();
      ix++;
      U++;
      /*********************** direction -2 ************************/
      _hop_y_m_pre32();
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_pre32();
      ix++;
      U++;
      /*********************** direction -3 ************************/
      _hop_z_m_pre32();
      ix++;
      /************************ end of loop ************************/
    }

    /* communicate the 32-bit half-spinor boundaries */
#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32();
#    endif

    /* second sweep writes into the solution vector l */
    s = l;
    phi32 = NBPointer32[2 + ieo];
    /* opposite-parity gauge copy for the backward links */
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    //_prefetch_halfspinor(phi32[0]);
    _prefetch_su3(U);

    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi32[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post32();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post32();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post32();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post32();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post32();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post32();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post32();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    /* full (double) precision branch -- same structure as above */
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector */
    s = k;
    _prefetch_spinor(s);

    /* s contains the source vector */
    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix = 0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre();
      s++;
      U++;
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_pre();
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_pre();
      ix++;
      U++;
      /*********************** direction -1 ************************/
      _hop_x_m_pre();
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_pre();
      ix++;
      U++;
      /*********************** direction -2 ************************/
      _hop_y_m_pre();
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_pre();
      ix++;
      U++;
      /*********************** direction -3 ************************/
      _hop_z_m_pre();
      ix++;
      /************************ end of loop ************************/
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield();
#    endif

    s = l;
    phi = NBPointer[2 + ieo];
    //_prefetch_halfspinor(phi[0]);
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);

    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
/* this is the hopping part only */
/* local_H
 *
 * Applies the eight-direction hopping term at a single site and stores the
 * resulting full spinor in rr (Blue Gene "bgl" macro version).
 *
 * rr   : output spinor for this site
 * s    : base of the source field; neighbours are addressed via the index
 *        stream, sp/sm = s + *idx
 * u    : the eight link matrices for this site, laid out consecutively as
 *        +0,-0,+1,-1,+2,-2,+3,-3 (up/um alternate through them)
 * _idx : eight neighbour indices in the same direction order
 *
 * Statement order is load-bearing: each section prefetches the link and
 * neighbour for the NEXT direction while the current projection/multiply
 * runs.  Accumulation happens in the rs* register set, which the macros
 * presumably zero/initialize on the first _bgl_add_to_rs* use -- confirm
 * against bgl.h.
 */
void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) {

  int * idx = _idx;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#pragma disjoint(*s, *sp, *sm, *rr, *up, *um)

  __alignx(16,rr);
  __alignx(16,s);

  /*********************** direction +0 ************************/
  up = u;
  sp = (spinor *) s + (*idx);
  idx++;

  um = up+1;
  _prefetch_su3(um);
  sm = (spinor *) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  /* (1 + gamma0): s0+s2, s1+s3 */
  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_add_reg1();
  /* result is now in regx0, regx1, regx2 x = 0,1 */

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs3_reg1();

  /*********************** direction -0 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  /* (1 - gamma0): s0-s2, s1-s3, U^dagger and conjugate phase */
  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_sub_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_sub_from_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs3_reg1();

  /*********************** direction +1 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  /* (1 + gamma1): s0 + i s3, s1 + i s2 */
  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s3);
  _bgl_load_reg1_up(sp->s2);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_add_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_1);
  _bgl_add_to_rs0_reg0();
  _bgl_i_mul_sub_from_rs3_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg1();

  /*********************** direction -1 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  /* (1 - gamma1): s0 - i s3, s1 - i s2 */
  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s3);
  _bgl_load_reg1_up(sm->s2);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_sub_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_1);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_add_to_rs3_reg0();
  _bgl_i_mul_add_to_rs2_reg1();

  /*********************** direction +2 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  idx++;

  /* (1 + gamma2): s0 + s3, s1 - s2 */
  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg1_up(sp->s2);
  _bgl_load_reg0_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_sub_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_2);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs2_reg1();
  _bgl_add_to_rs3_reg0();

  /*********************** direction -2 ************************/
  up = um+1;
  _prefetch_su3(up);
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp);
  idx++;

  /* (1 - gamma2): s0 - s3, s1 + s2 */
  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg1_up(sm->s2);
  _bgl_load_reg0_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_add_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_2);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs2_reg1();
  _bgl_sub_from_rs3_reg0();

  /*********************** direction +3 ************************/
  um = up+1;
  _prefetch_su3(um);
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm);
  /* no idx++ here: this was the last (eighth) index */

  /* (1 + gamma3): s0 + i s2, s1 - i s3 */
  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_sub_reg1();

  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_3);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg0();
  _bgl_i_mul_add_to_rs3_reg1();

  /*********************** direction -3 ************************/
  /* (1 - gamma3): s0 - i s2, s1 + i s3; final accumulate + store to rr */
  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_add_reg1();

  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_3);

  _bgl_add_to_rs0_reg0();
  _bgl_store_rs0(rr->s0);
  _bgl_i_mul_add_to_rs2_reg0();
  _bgl_store_rs2(rr->s2);

  _bgl_add_to_rs1_reg1();
  _bgl_store_rs1(rr->s1);
  _bgl_i_mul_sub_from_rs3_reg1();
  _bgl_store_rs3(rr->s3);
}
/* Hopping_Matrix (SSE halfspinor version)
 *
 * Applies the even/odd hopping matrix: l = H_{eo/oe} k.  First sweep:
 * project the source k onto half spinors through the NBPointer indirection
 * table (forward directions multiplied with U and ka_mu here; backward
 * directions stored unmultiplied).  After the boundary exchange, the second
 * sweep applies U^dagger and conj(ka_mu) to the backward contributions and
 * sums all eight into the output l.
 *
 * ieo : parity of the output field l (source k has opposite parity)
 *
 * NOTE(review): rs is static, so this version is not thread-safe.
 */
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix, i;
  su3 * restrict U ALIGN;
  static spinor rs;
  spinor * restrict s ALIGN;
  halfspinor ** phi ALIGN;
  /* prefetch distance (in su3 matrices) tuned per CPU */
#if defined OPTERON
  const int predist = 2;
#else
  const int predist = 1;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  /* We will run through the source vector now */
  /* instead of the solution vector */
  s = k;
  _prefetch_spinor(s);

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  phi = NBPointer[ieo];

  _prefetch_su3(U);
  /**************** loop over all lattice sites ******************/
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    _prefetch_su3(U+predist);

    /* (1 + gamma0) projection, times U and ka0 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s1);
    U++;
    ix++;

    /*********************** direction -0 ************************/
    /* (1 - gamma0); U^dagger is applied in the second sweep */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +1 ************************/
    _prefetch_su3(U+predist);

    /* (1 + gamma1): s0 + i s3, s1 + i s2 */
    _sse_load((*s).s0);
    /*next not needed?*/
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -1 ************************/
    /* (1 - gamma1): s0 - i s3, s1 - i s2 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +2 ************************/
    _prefetch_su3(U+predist);

    /* (1 + gamma2): s0 + s3, s1 - s2 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -2 ************************/
    /* (1 - gamma2): s0 - s3, s1 + s2 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +3 ************************/
    _prefetch_su3(U+predist);
    _prefetch_spinor(s+1);

    /* (1 + gamma3): s0 + i s2, s1 - i s3 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -3 ************************/
    /* (1 - gamma3): s0 - i s2, s1 + i s3 */
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    s++;
  }

  /* communicate the half-spinor boundaries */
# if (defined MPI && !defined _NO_COMM)
  xchange_halffield();
# endif

  /* second sweep writes into the solution vector l */
  s = l;
  phi = NBPointer[2 + ieo];
  /* opposite-parity gauge copy for the backward links */
  if(ieo == 0) {
    U = g_gauge_field_copy[1][0];
  }
  else {
    U = g_gauge_field_copy[0][0];
  }
  _prefetch_su3(U);

  /* Now we sum up and expand to a full spinor */
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    /* +0 already carries U*ka0; just expand the half spinor */
    _vector_assign(rs.s0, (*phi[ix]).s0);
    _vector_assign(rs.s2, (*phi[ix]).s0);
    _vector_assign(rs.s1, (*phi[ix]).s1);
    _vector_assign(rs.s3, (*phi[ix]).s1);
    ix++;

    /*********************** direction -0 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);
    ix++;
    U++;
    /*********************** direction +1 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s3);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);
    ix++;

    /*********************** direction -1 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;

    /*********************** direction +2 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);
    ix++;

    /*********************** direction -2 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;

    /*********************** direction +3 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store(rs.s3);
    ix++;

    /*********************** direction -3 ************************/
    _prefetch_su3(U+predist);
    _prefetch_spinor(s+1);

    /* last direction: accumulate and stream the result out to l */
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt((*s).s0);
    _sse_load(rs.s2);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*s).s2);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt((*s).s1);
    _sse_load(rs.s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*s).s3);
    ix++;
    U++;
    s++;
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}