void deriv_Sb_D_psi(spinor * const l, spinor * const k, hamiltonian_field_t * const hf, const double factor) { int ix,iy, iz; int ioff,ioff2,icx,icy, icz; su3 * restrict up ALIGN; su3 * restrict um ALIGN; su3adj * restrict ddd; static su3adj der; static su3 v1,v2; static su3_vector psia,psib,phia,phib; static spinor rr; spinor * restrict r ALIGN; spinor * restrict sp ALIGN; spinor * restrict sm ALIGN; /* We have 32 registers available */ double _Complex reg00, reg01, reg02, reg03, reg04, reg05; double _Complex reg10, reg11, reg12, reg13, reg14, reg15; /* For su3 matrix, use reg00 for missing register */ double _Complex v00, v01, v02, v10, v11, v12, v20, v21; /* The following contains the left spinor (12 regs) and the final */ /* su3 matrix to trace over */ double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, r30, r31, r32; #ifdef _KOJAK_INST # pragma pomp inst begin(derivSb) #endif #pragma disjoint(*r, *sp, *sm, *up, *um, *ddd) __alignx(16, l); __alignx(16, k); if(ieo==0) { ioff=0; } else { ioff=(VOLUME+RAND)/2; } ioff2=(VOLUME+RAND)/2-ioff; /* for parallelization */ #ifdef MPI xchange_field(k, ieo); xchange_field(l, (ieo+1)%2); #endif /************** loop over all lattice sites ****************/ ix=ioff; iy=g_iup[ix][0]; icy=iy; sp = k + icy; _prefetch_spinor(sp); up=&hf->gaugefield[ix][0]; _prefetch_su3(up); for(icx = ioff; icx < (VOLUME+ioff); icx++){ /* load left vector r and */ /* multiply with gamma5 */ r = l + (icx-ioff); ix=icx; /*********************** direction +0 ********************/ ddd = &hf->derivative[ix][0]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_add_to_reg0_reg1(); _bgl_add_to_reg0_up_reg1_up(); _bgl_add_r0_to_r2_reg1(); _bgl_add_r1_to_r3_reg1_up(); iy=g_idn[ix][0]; icy=iy; sm = k + icy; _prefetch_spinor(sm); um=&hf->gaugefield[iy][0]; _prefetch_su3(um); _bgl_tensor_product_and_add(); /* result in v now */ _bgl_su3_times_v_dagger(*up); /* result in r now */ _bgl_complex_times_r(ka0); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /************** direction -0 ****************************/ ddd = &hf->derivative[iy][0]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sm).s0); _bgl_load_reg0_up((*sm).s1); _bgl_load_reg1((*sm).s2); _bgl_load_reg1_up((*sm).s3); _bgl_sub_from_reg0_reg1(); _bgl_sub_from_reg0_up_reg1_up(); _bgl_sub_from_r0_r2_reg1(); _bgl_sub_from_r1_r3_reg1_up(); iy=g_iup[ix][1]; icy=[iy]; sp = k + icy; _prefetch_spinor(sp); up=&hf->gaugefield[ix][1]; _prefetch_su3(up); _bgl_tensor_product_and_add_d(); /* result in v now */ _bgl_su3_times_v_dagger(*um); /* result in r now */ _bgl_complex_times_r(ka0); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /*************** direction +1 **************************/ ddd = &hf->derivative[ix][1]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_i_mul_add_to_reg0_reg1_up(); _bgl_i_mul_add_to_reg0_up_reg1(); _bgl_i_mul_add_r0_to_r3_reg1(); _bgl_i_mul_add_r1_to_r2_reg1_up(); iy=g_idn[ix][1]; icy=iy; sm = k + icy; _prefetch_spinor(sm); um=&hf->gaugefield[iy][1]; _prefetch_su3(um); _bgl_tensor_product_and_add(); /* result in v now */ _bgl_su3_times_v_dagger(*up); /* result in r now */ _bgl_complex_times_r(ka1); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /**************** direction -1 *************************/ ddd = &hf->derivative[iy][1]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_i_mul_sub_from_reg0_reg1_up(); _bgl_i_mul_sub_from_reg0_up_reg1(); _bgl_i_mul_sub_from_r0_r3_reg1(); _bgl_i_mul_sub_from_r1_r2_reg1_up(); iy=g_iup[ix][2]; icy=iy; sp = k + icy; _prefetch_spinor(sp); up=&hf->gaugefield[ix][2]; _prefetch_su3(up); _bgl_tensor_product_and_add_d(); /* result in v now */ _bgl_su3_times_v_dagger(*um); /* result in r now */ _bgl_complex_times_r(ka1); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /*************** direction +2 **************************/ ddd = &hf->derivative[ix][2]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_add_to_reg0_reg1_up(); _bgl_sub_from_reg0_up_reg1(); _bgl_add_r0_to_r3_reg1(); _bgl_sub_from_r1_r2_reg1_up(); iy=g_idn[ix][2]; icy=iy; sm = k + icy; _prefetch_spinor(sm); um=&hf->gaugefield[iy][2]; _prefetch_su3(um); _bgl_tensor_product_and_add(); /* result in v now */ _bgl_su3_times_v_dagger(*up); /* result in r now */ _bgl_complex_times_r(ka2); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /***************** direction -2 ************************/ ddd = &hf->derivative[iy][2]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_sub_from_reg0_reg1_up(); _bgl_add_to_reg0_up_reg1(); _bgl_sub_from_r0_r3_reg1(); _bgl_add_r1_to_r2_reg1_up(); iy=g_iup[ix][3]; icy=iy; sp = k + icy; _prefetch_spinor(sp); up=&hf->gaugefield[ix][3]; _prefetch_su3(up); _bgl_tensor_product_and_add_d(); /* result in v now */ _bgl_su3_times_v_dagger(*um); /* result in r now */ _bgl_complex_times_r(ka1); _bgl_trace_lambda_mul_add_assign(*ddd, 2.*factor); /****************** direction +3 ***********************/ ddd = &hf->derivative[ix][3]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_i_mul_add_to_reg0_reg1(); _bgl_i_mul_sub_from_reg0_up_reg1_up(); _bgl_i_mul_add_r0_to_r2_reg1(); _bgl_i_mul_sub_from_r1_r3_reg1_up(); iy=g_idn[ix][3]; icy=iy; sm = k + icy; _prefetch_spinor(sm); um=&hf->gaugefield[iy][3]; _prefetch_su3(um); _bgl_tensor_product_and_add(); /* result in v now */ _bgl_su3_times_v_dagger(*up); /* result in r now */ _bgl_complex_times_r(ka3); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /***************** direction -3 ************************/ ddd = &hf->derivative[iy][3]; _bgl_load_r0((*r).s0); _bgl_load_r1((*r).s1); _bgl_load_minus_r2((*r).s2); _bgl_load_minus_r3((*r).s3); _bgl_load_reg0((*sp).s0); _bgl_load_reg0_up((*sp).s1); _bgl_load_reg1((*sp).s2); _bgl_load_reg1_up((*sp).s3); _bgl_i_mul_sub_from_reg0_reg1(); _bgl_i_mul_add_to_reg0_up_reg1_up(); _bgl_i_mul_sub_from_r0_r2_reg1(); _bgl_i_mul_add_r1_to_r3_reg1_up(); /* something wrong here...*/ icz=icx+1; if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff; iz=icz; iy=g_iup[iz][0]; icy=iy; sp = k + icy; _prefetch_spinor(sp); up=&hf->gaugefield[iz][0]; _prefetch_su3(up); _bgl_tensor_product_and_add_d(); /* result in v now */ _bgl_su3_times_v_dagger(*um); /* result in r now */ _bgl_complex_times_r(ka3); _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); /****************** end of loop ************************/ } #ifdef _KOJAK_INST #pragma pomp inst end(derivSb) #endif }
/* this is the hopping part only */ void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) { int * idx = _idx; su3 * restrict up ALIGN; su3 * restrict um ALIGN; spinor * restrict sp ALIGN; spinor * restrict sm ALIGN; #pragma disjoint(*s, *sp, *sm, *rr, *up, *um) __alignx(16,rr); __alignx(16,s); /*********************** direction +0 ************************/ up = u; sp = (spinor *) s + (*idx); idx++; um = up+1; _prefetch_su3(um); sm = (spinor *) s + (*idx); _prefetch_spinor(sm); idx++; _bgl_load_reg0(sp->s0); _bgl_load_reg1(sp->s1); _bgl_load_reg0_up(sp->s2); _bgl_load_reg1_up(sp->s3); _bgl_vector_add_reg0(); _bgl_vector_add_reg1(); /* result is now in regx0, regx1, regx2 x = 0,1 */ _bgl_su3_multiply_double((*up)); _bgl_vector_cmplx_mul_double(phase_0); _bgl_add_to_rs0_reg0(); _bgl_add_to_rs2_reg0(); _bgl_add_to_rs1_reg1(); _bgl_add_to_rs3_reg1(); /*********************** direction -0 ************************/ up = um+1; _prefetch_su3(up); sp = (spinor*) s + (*idx); _prefetch_spinor(sp); idx++; _bgl_load_reg0(sm->s0); _bgl_load_reg1(sm->s1); _bgl_load_reg0_up(sm->s2); _bgl_load_reg1_up(sm->s3); _bgl_vector_sub_reg0(); _bgl_vector_sub_reg1(); _bgl_su3_inverse_multiply_double((*um)); _bgl_vector_cmplxcg_mul_double(phase_0); _bgl_add_to_rs0_reg0(); _bgl_sub_from_rs2_reg0(); _bgl_add_to_rs1_reg1(); _bgl_sub_from_rs3_reg1(); /*********************** direction +1 ************************/ um = up+1; _prefetch_su3(um); sm = (spinor*) s + (*idx); _prefetch_spinor(sm); idx++; _bgl_load_reg0(sp->s0); _bgl_load_reg1(sp->s1); _bgl_load_reg0_up(sp->s3); _bgl_load_reg1_up(sp->s2); _bgl_vector_i_mul_add_reg0(); _bgl_vector_i_mul_add_reg1(); _bgl_su3_multiply_double((*up)); _bgl_vector_cmplx_mul_double(phase_1); _bgl_add_to_rs0_reg0(); _bgl_i_mul_sub_from_rs3_reg0(); _bgl_add_to_rs1_reg1(); _bgl_i_mul_sub_from_rs2_reg1(); /*********************** direction -1 ************************/ up = um+1; _prefetch_su3(up); sp = (spinor*) s + (*idx); _prefetch_spinor(sp); idx++; _bgl_load_reg0(sm->s0); _bgl_load_reg1(sm->s1); _bgl_load_reg0_up(sm->s3); _bgl_load_reg1_up(sm->s2); _bgl_vector_i_mul_sub_reg0(); _bgl_vector_i_mul_sub_reg1(); _bgl_su3_inverse_multiply_double((*um)); _bgl_vector_cmplxcg_mul_double(phase_1); _bgl_add_to_rs0_reg0(); _bgl_add_to_rs1_reg1(); _bgl_i_mul_add_to_rs3_reg0(); _bgl_i_mul_add_to_rs2_reg1(); /*********************** direction +2 ************************/ um = up+1; _prefetch_su3(um); sm = (spinor*) s + (*idx); _prefetch_spinor(sm); idx++; _bgl_load_reg0(sp->s0); _bgl_load_reg1(sp->s1); _bgl_load_reg1_up(sp->s2); _bgl_load_reg0_up(sp->s3); _bgl_vector_add_reg0(); _bgl_vector_sub_reg1(); _bgl_su3_multiply_double((*up)); _bgl_vector_cmplx_mul_double(phase_2); _bgl_add_to_rs0_reg0(); _bgl_add_to_rs1_reg1(); _bgl_sub_from_rs2_reg1(); _bgl_add_to_rs3_reg0(); /*********************** direction -2 ************************/ up = um+1; _prefetch_su3(up); sp = (spinor*) s + (*idx); _prefetch_spinor(sp); idx++; _bgl_load_reg0(sm->s0); _bgl_load_reg1(sm->s1); _bgl_load_reg1_up(sm->s2); _bgl_load_reg0_up(sm->s3); _bgl_vector_sub_reg0(); _bgl_vector_add_reg1(); _bgl_su3_inverse_multiply_double((*um)); _bgl_vector_cmplxcg_mul_double(phase_2); _bgl_add_to_rs0_reg0(); _bgl_add_to_rs1_reg1(); _bgl_add_to_rs2_reg1(); _bgl_sub_from_rs3_reg0(); /*********************** direction +3 ************************/ um = up+1; _prefetch_su3(um); sm = (spinor*) s + (*idx); _prefetch_spinor(sm); _bgl_load_reg0(sp->s0); _bgl_load_reg1(sp->s1); _bgl_load_reg0_up(sp->s2); _bgl_load_reg1_up(sp->s3); _bgl_vector_i_mul_add_reg0(); _bgl_vector_i_mul_sub_reg1(); _bgl_su3_multiply_double((*up)); _bgl_vector_cmplx_mul_double(phase_3); _bgl_add_to_rs0_reg0(); _bgl_add_to_rs1_reg1(); _bgl_i_mul_sub_from_rs2_reg0(); _bgl_i_mul_add_to_rs3_reg1(); /*********************** direction -3 ************************/ _bgl_load_reg0(sm->s0); _bgl_load_reg1(sm->s1); _bgl_load_reg0_up(sm->s2); _bgl_load_reg1_up(sm->s3); _bgl_vector_i_mul_sub_reg0(); _bgl_vector_i_mul_add_reg1(); _bgl_su3_inverse_multiply_double((*um)); _bgl_vector_cmplxcg_mul_double(phase_3); _bgl_add_to_rs0_reg0(); _bgl_store_rs0(rr->s0); _bgl_i_mul_add_to_rs2_reg0(); _bgl_store_rs2(rr->s2); _bgl_add_to_rs1_reg1(); _bgl_store_rs1(rr->s1); _bgl_i_mul_sub_from_rs3_reg1(); _bgl_store_rs3(rr->s3); }