Example #1
void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sign, const int N){
#ifdef OMP
#pragma omp parallel
  {
#endif
  _Complex double z,w;
  int ix;
  double sign = 1.; 
  spinor *r, *s;

  if(_sign < 0.){
    sign = -1.; 
  }

  z = 1. + (sign * g_mu) * I;
  w = conj(z);

  /************ loop over all lattice sites ************/
#ifdef OMP
#pragma omp for
#endif
  for(ix = 0; ix < N; ix++){
    s=l+ix;
    r=k+ix;

    /* Multiply the spinorfield with 1+imu\gamma_5 */
#if ( defined SSE2 || defined SSE3 )
    _prefetch_spinor((r+predist));
    _prefetch_spinor((s+predist));
    _sse_load_up(r->s0);
    _sse_vector_cmplx_mul(z);
    _sse_store_nt_up(s->s0);
    _sse_load_up(r->s1);
    _sse_vector_cmplx_mul_two();
    _sse_store_nt_up(s->s1);
    _sse_load_up(r->s2);
    _sse_vector_cmplx_mul(w);
    _sse_store_nt_up(s->s2);
    _sse_load_up(r->s3);
    _sse_vector_cmplx_mul_two();
    _sse_store_nt_up(s->s3);
#else
    _complex_times_vector(s->s0, z, r->s0);
    _complex_times_vector(s->s1, z, r->s1);
    _complex_times_vector(s->s2, w, r->s2);
    _complex_times_vector(s->s3, w, r->s3);
#endif
  }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
}
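
Example #1 scales the upper two spin components by z = 1 + i*sign*g_mu and the lower two by its conjugate, which is exactly the +-1 action of gamma_5 on the two chiral halves. A minimal scalar sketch of the per-site operation; the types and names below are illustrative stand-ins, not the tmLQCD definitions:

/* Scalar sketch of (1 +- i*mu*gamma_5) acting on a single spinor.
 * Illustrative types only. */
#include <complex.h>

typedef struct { double _Complex c[3]; } vec3;
typedef struct { vec3 s0, s1, s2, s3; } spin;

static void mul_one_pm_imu_site(spin *out, const spin *in,
                                const double mu, const double sign) {
  const double _Complex z = 1.0 + sign * mu * I; /* upper components       */
  const double _Complex w = conj(z);             /* gamma_5 flips the sign */
  for (int c = 0; c < 3; c++) {                  /* of mu on the lower two */
    out->s0.c[c] = z * in->s0.c[c];
    out->s1.c[c] = z * in->s1.c[c];
    out->s2.c[c] = w * in->s2.c[c];
    out->s3.c[c] = w * in->s3.c[c];
  }
}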
Example #2
void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N){
#ifdef OMP
#pragma omp parallel
  {
#endif
  _Complex double ALIGN z,w;
  int ix;
  double sign=-1.; 
  spinor *r;

  su3_vector ALIGN phi1;

  double ALIGN nrm = 1./(1.+g_mu*g_mu);

  if(_sign < 0.){
    sign = 1.; 
  }

  z = nrm + (sign * nrm * g_mu) * I;
  w = conj(z);
  /************ loop over all lattice sites ************/
#ifdef OMP
#pragma omp for
#endif
  for(ix = 0; ix < N; ix++){
    r=l + ix;
    /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */
#if ( defined SSE2 || defined SSE3 )
    _prefetch_spinor((r+predist)); 
    _sse_load_up(r->s0);
    _sse_vector_cmplx_mul(z);
    _sse_store_nt_up(r->s0);
    _sse_load_up(r->s1);
    _sse_vector_cmplx_mul_two();
    _sse_store_nt_up(r->s1);
    _sse_load_up(r->s2);
    _sse_vector_cmplx_mul(w);
    _sse_store_nt_up(r->s2);
    _sse_load_up(r->s3);
    _sse_vector_cmplx_mul_two();
    _sse_store_nt_up(r->s3);
#else
    _complex_times_vector(phi1, z, r->s0);
    _vector_assign(r->s0, phi1);
    _complex_times_vector(phi1, z, r->s1);
    _vector_assign(r->s1, phi1);
    _complex_times_vector(phi1, w, r->s2);
    _vector_assign(r->s2, phi1);
    _complex_times_vector(phi1, w, r->s3);
    _vector_assign(r->s3, phi1);
#endif
  }

#ifdef OMP
  } /* OpenMP closing brace */
#endif

}
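
The inverse acts per chiral half: (1 + i*sign*mu)^(-1) = (1 - i*sign*mu)/(1 + mu^2), which is why this routine flips the sign relative to Example #1 and scales z and w by nrm = 1./(1.+g_mu*g_mu). A small self-contained check of that identity (illustrative only):

/* Quick numerical check that the z of Example #2 really inverts
 * the z of Example #1 on each chiral half. */
#include <complex.h>
#include <stdio.h>

int main(void) {
  const double mu = 0.03;
  const double _Complex fwd = 1.0 + mu * I;                     /* Example #1 */
  const double _Complex inv = (1.0 - mu * I) / (1.0 + mu * mu); /* Example #2 */
  printf("fwd*inv = %g + %g*I\n", creal(fwd * inv), cimag(fwd * inv));
  return 0; /* prints 1 + 0*I */
}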
Example #3
void deriv_Sb_D_psi(spinor * const l, spinor * const k, 
		    hamiltonian_field_t * const hf, const double factor) {

  int ix,iy, iz;
  int ioff,ioff2,icx,icy, icz;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  su3adj * restrict ddd;
  static su3adj der;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  spinor * restrict r ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

  /* We have 32 registers available */
  double _Complex reg00, reg01, reg02, reg03, reg04, reg05;
  double _Complex reg10, reg11, reg12, reg13, reg14, reg15;
  /* For su3 matrix, use reg00 for missing register */
  double _Complex v00, v01, v02, v10, v11, v12, v20, v21;
  /* The following contains the left spinor (12 regs) and the final  */
  /* su3 matrix to trace over */
  double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, 
    r30, r31, r32;

#ifdef _KOJAK_INST
# pragma pomp inst begin(derivSb)
#endif

#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd)
  __alignx(16, l);
  __alignx(16, k);

  if(ieo==0) {
    ioff=0;
  }
  else {
    ioff=(VOLUME+RAND)/2;
  } 
  ioff2=(VOLUME+RAND)/2-ioff;

  /* for parallelization */
#ifdef MPI
  xchange_field(k, ieo);
  xchange_field(l, (ieo+1)%2);
#endif
  /************** loop over all lattice sites ****************/

  ix=ioff;
  iy=g_iup[ix][0]; icy=iy;
  sp = k + icy;
  _prefetch_spinor(sp);
  up=&hf->gaugefield[ix][0];
  _prefetch_su3(up);

  for(icx = ioff; icx < (VOLUME+ioff); icx++){

    /* load left vector r and */
    /* multiply with gamma5   */
    r = l + (icx-ioff);
    ix=icx;

    /*********************** direction +0 ********************/

    ddd = &hf->derivative[ix][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_add_to_reg0_reg1();
    _bgl_add_to_reg0_up_reg1_up();
      
    _bgl_add_r0_to_r2_reg1();
    _bgl_add_r1_to_r3_reg1_up();

    iy=g_idn[ix][0]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][0];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /************** direction -0 ****************************/

    ddd = &hf->derivative[iy][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);

    _bgl_sub_from_reg0_reg1();
    _bgl_sub_from_reg0_up_reg1_up();

    _bgl_sub_from_r0_r2_reg1();
    _bgl_sub_from_r1_r3_reg1_up();

    iy=g_iup[ix][1]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][1];      
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +1 **************************/

    ddd = &hf->derivative[ix][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_i_mul_add_to_reg0_reg1_up();
    _bgl_i_mul_add_to_reg0_up_reg1();
      
    _bgl_i_mul_add_r0_to_r3_reg1();
    _bgl_i_mul_add_r1_to_r2_reg1_up();

    iy=g_idn[ix][1]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][1];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /**************** direction -1 *************************/

    ddd = &hf->derivative[iy][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_i_mul_sub_from_reg0_reg1_up();
    _bgl_i_mul_sub_from_reg0_up_reg1();
      
    _bgl_i_mul_sub_from_r0_r3_reg1();
    _bgl_i_mul_sub_from_r1_r2_reg1_up();

    iy=g_iup[ix][2]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][2];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +2 **************************/

    ddd = &hf->derivative[ix][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_add_to_reg0_reg1_up();
    _bgl_sub_from_reg0_up_reg1();
      
    _bgl_add_r0_to_r3_reg1();
    _bgl_sub_from_r1_r2_reg1_up();

    iy=g_idn[ix][2]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][2];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);
      
    /***************** direction -2 ************************/

    ddd = &hf->derivative[iy][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_sub_from_reg0_reg1_up();
    _bgl_add_to_reg0_up_reg1();
      
    _bgl_sub_from_r0_r3_reg1();
    _bgl_add_r1_to_r2_reg1_up();

    iy=g_iup[ix][3]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][3];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** direction +3 ***********************/

    ddd = &hf->derivative[ix][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_i_mul_add_to_reg0_reg1();
    _bgl_i_mul_sub_from_reg0_up_reg1_up();
      
    _bgl_i_mul_add_r0_to_r2_reg1();
    _bgl_i_mul_sub_from_r1_r3_reg1_up();

    iy=g_idn[ix][3]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][3];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -3 ************************/

    ddd = &hf->derivative[iy][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_i_mul_sub_from_reg0_reg1();
    _bgl_i_mul_add_to_reg0_up_reg1_up();
      
    _bgl_i_mul_sub_from_r0_r2_reg1();
    _bgl_i_mul_add_r1_to_r3_reg1_up();

    /* something wrong here...*/
    icz=icx+1;
    if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff;
    iz=icz;
    iy=g_iup[iz][0]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[iz][0];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
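
The block flagged "something wrong here" is a wrap-around software pipeline: at the bottom of each iteration the next loop index icz is computed (wrapping back to ioff on the final pass so the address stays valid) and the +0 neighbour spinor and gauge link of that next site are prefetched, mirroring the priming prefetch issued just before the loop. A minimal sketch of the idiom, with hypothetical callbacks standing in for the prefetch and compute macros:

/* Sketch of the prime-then-prefetch-ahead loop used above.
 * prefetch() and compute() are hypothetical stand-ins. */
void pipelined_loop(const int n, void (*prefetch)(int), void (*compute)(int)) {
  prefetch(0);                          /* prime before the loop        */
  for (int i = 0; i < n; i++) {
    compute(i);                         /* data for i already requested */
    const int next = (i + 1 == n) ? 0 : i + 1; /* wrap keeps it valid   */
    prefetch(next);                     /* hide the latency of i+1      */
  }
}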
Example #4
void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k, 
			    spinor * const j, const double _sign, const int N){
#ifdef OMP
#pragma omp parallel
  {
#endif
  _Complex double z,w;
  int ix;
  double sign=1.;
  spinor *r, *s, *t;

#if (!defined SSE2 && !defined SSE3)

  su3_vector ALIGN phi1, phi2, phi3, phi4;
  
#endif

  if(_sign < 0.){
    sign = -1.;
  }

  z = 1. + (sign * g_mu) * I;
  w = conj(z);
  /************ loop over all lattice sites ************/
#ifdef OMP
#pragma omp for
#endif
  for(ix = 0; ix < N; ix++){
    r = k+ix;
    s = j+ix;
    t = l+ix;
    /* Multiply the spinorfield with 1+imu\gamma_5 */
#if (defined SSE2 || defined SSE3)
    _prefetch_spinor((r+predist));
    _prefetch_spinor((s+predist));
    _sse_load_up(r->s0);
    _sse_vector_cmplx_mul(z);
    _sse_load(s->s0);
    _sse_vector_sub_up();
    _sse_store_nt_up(t->s0);
    _sse_load_up(r->s1);
    _sse_vector_cmplx_mul_two();
    _sse_load(s->s1);
    _sse_vector_sub_up();
    _sse_store_nt_up(t->s1);
    _sse_load_up(r->s2);
    _sse_vector_cmplx_mul(w);
    _sse_load(s->s2);
    _sse_vector_sub_up();
    _sse_store_nt_up(t->s2);
    _sse_load_up(r->s3);
    _sse_vector_cmplx_mul_two();
    _sse_load(s->s3);
    _sse_vector_sub_up();
    _sse_store_nt_up(t->s3);
#else
    _complex_times_vector(phi1, z, r->s0);
    _complex_times_vector(phi2, z, r->s1);
    _complex_times_vector(phi3, w, r->s2);
    _complex_times_vector(phi4, w, r->s3);
    /* Subtract s and store the result in t */
    _vector_sub(t->s0, phi1, s->s0);
    _vector_sub(t->s1, phi2, s->s1);
    _vector_sub(t->s2, phi3, s->s2);
    _vector_sub(t->s3, phi4, s->s3);
#endif
  }

#ifdef OMP
  } /* OpenMP closing brace */
#endif
}
Example #5
/* Serially Checked ! */
void Dtm_psi(spinor * const P, spinor * const Q){

  if(P==Q){
    printf("Error in Dtm_psi (D_psi.c):\n");
    printf("Arguments must be differen spinor fields\n");
    printf("Program aborted\n");
    exit(1);
  }

#ifdef _GAUGE_COPY2
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

# if defined TM_USE_MPI
  xchange_lexicfield(Q);
# endif

#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
    int ix,iy,iz;
    su3 *up,*um;
    spinor *s,*sp,*sm,*rn;
    _Complex double fact1, fact2;
    spinor rs __attribute__ ((aligned (16)));

    fact1 = 1. + g_mu * I;
    fact2 = conj(fact1);

#ifndef TM_USE_OMP
    iy=g_iup[0][0];
    sp=(spinor *) Q + iy;
    up=&g_gauge_field[0][0];
#endif

    /************************ loop over all lattice sites *************************/
#ifdef TM_USE_OMP
#pragma omp for
#endif
    for (ix=0;ix<VOLUME;ix++){
#ifdef TM_USE_OMP
      iy=g_iup[ix][0];
      up=&g_gauge_field[ix][0];
      sp=(spinor *) Q + iy;
#endif
      s=(spinor *) Q + ix;
      _prefetch_spinor(s);

      /******************************* direction +0 *********************************/

      iy=g_idn[ix][0];
      
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);       

      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s2);

      // the diagonal bit
      _sse_load_up(s->s0);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s0);

      // g5 in the twisted term
      _sse_load_up(s->s2);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);      
      
      um=&g_gauge_field[iy][0];
      _prefetch_su3(um);
      
      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_add();
      
      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_0);
      _sse_store_up(rs.s3);
    
      // the diagonal bit
      _sse_load_up(s->s1);
      _sse_vector_cmplx_mul(fact1);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s1);

      // g5 in the twisted term
      _sse_load_up(s->s3);
      _sse_vector_cmplx_mul(fact2);
      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3); 

      /******************************* direction -0 *********************************/

      iy=g_iup[ix][1];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);
      
      up+=1;
      _prefetch_su3(up);
      
      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_0);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);
      
      /******************************* direction +1 *********************************/

      iy=g_idn[ix][1];
      
      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s3); 
      
      um=&g_gauge_field[iy][1];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s2);       

      /******************************* direction -1 *********************************/

      iy=g_iup[ix][2];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s3);

      up+=1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_1);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s2);

      /******************************* direction +2 *********************************/

      iy=g_idn[ix][2];

      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s3);
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_add();
      _sse_store(rs.s3);
      
      um=&g_gauge_field[iy][2];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s2);
      _sse_vector_sub();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_sub();
      _sse_store(rs.s2);      

      /******************************* direction -2 *********************************/

      iy=g_iup[ix][3];

      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s3);
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s3);
      _sse_vector_sub();
      _sse_store(rs.s3);
      
      up+=1;
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s2);
      _sse_vector_add();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_2);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s2);
      _sse_vector_add();
      _sse_store(rs.s2);      
      
      /******************************* direction +3 *********************************/

      iy=g_idn[ix][3];

      sm = (spinor *) Q + iy;
      _prefetch_spinor(sm);

      _sse_load(sp->s0);
      _sse_load_up(sp->s2);
      _sse_vector_i_mul();
      _sse_vector_add();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store(rs.s0);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store(rs.s2);
      
      um=&g_gauge_field[iy][3];
      _prefetch_su3(um);

      _sse_load(sp->s1);
      _sse_load_up(sp->s3);
      _sse_vector_i_mul();
      _sse_vector_sub();

      _sse_su3_multiply((*up));
      _sse_vector_cmplx_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store(rs.s1);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store(rs.s3);
      
      /******************************* direction -3 *********************************/

      iz=(ix+1+VOLUME)%VOLUME;

      iy=g_iup[iz][0];
      
      sp = (spinor *) Q + iy;
      _prefetch_spinor(sp);

      _sse_load(sm->s0);
      _sse_load_up(sm->s2);
      _sse_vector_i_mul();
      _sse_vector_sub();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      rn = (spinor *) P + ix;
      
      _sse_load(rs.s0);
      _sse_vector_add();
      _sse_store_nt(rn->s0);

      _sse_load(rs.s2);
      _sse_vector_i_mul();      
      _sse_vector_add();
      _sse_store_nt(rn->s2);

      up=&g_gauge_field[iz][0];
      _prefetch_su3(up);

      _sse_load(sm->s1);
      _sse_load_up(sm->s3);
      _sse_vector_i_mul();
      _sse_vector_add();
      
      _sse_su3_inverse_multiply((*um));
      _sse_vector_cmplxcg_mul(phase_3);
      _sse_load(rs.s1);
      _sse_vector_add();
      _sse_store_nt(rn->s1);

      _sse_load(rs.s3);
      _sse_vector_i_mul();      
      _sse_vector_sub();
      _sse_store_nt(rn->s3);
      
      /******************************** end of loop *********************************/

    }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
}
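
Dtm_psi uses spin projection: the _sse_vector_add/_sse_vector_sub pairs issued before each _sse_su3_multiply build the two independent half-spinor components of (1 +- gamma_mu)psi, so every hop costs two su3 multiplications instead of four, and the projected result is then folded into both an upper and a lower component of rs. A scalar sketch of the +0 hop under the gamma_0 pairing the code suggests (s0 with s2, s1 with s3); all names and types here are illustrative:

/* Scalar sketch of the spin-projection trick for direction +0:
 * only the half-spinors h_s = s_s + s_{s+2} need the link multiply. */
#include <complex.h>

typedef struct { double _Complex c[3]; } cvec3;

static void su3_mul(cvec3 *r, const double _Complex u[3][3], const cvec3 *v) {
  for (int i = 0; i < 3; i++) {
    r->c[i] = 0.0;
    for (int j = 0; j < 3; j++) r->c[i] += u[i][j] * v->c[j];
  }
}

/* add the +0-hop contribution from neighbour nb to the result out */
static void hop_plus_0(cvec3 out[4], const double _Complex u[3][3],
                       const cvec3 nb[4], const double _Complex phase) {
  cvec3 h, uh;
  for (int s = 0; s < 2; s++) {            /* two half-spinor components */
    for (int c = 0; c < 3; c++) h.c[c] = nb[s].c[c] + nb[s + 2].c[c];
    su3_mul(&uh, u, &h);                   /* one multiply per component */
    for (int c = 0; c < 3; c++) {
      out[s].c[c]     += phase * uh.c[c];  /* upper component            */
      out[s + 2].c[c] += phase * uh.c[c];  /* projector fills the lower  */
    }
  }
}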
Example #6
/* this is the hopping part only */
void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) {

  int * idx = _idx;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#pragma disjoint(*s, *sp, *sm, *rr, *up, *um)

  __alignx(16,rr);
  __alignx(16,s);

  /*********************** direction +0 ************************/
  up = u;
  sp = (spinor *) s + (*idx);
  idx++;

  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor *) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_add_reg1();
  /* result is now in regx0, regx1, regx2 x = 0,1 */
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs3_reg1();

  /*********************** direction -0 ************************/
  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_sub_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_0);
  
  _bgl_add_to_rs0_reg0();
  _bgl_sub_from_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs3_reg1();
  
  /*********************** direction +1 ************************/
  
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s3);
  _bgl_load_reg1_up(sp->s2);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_add_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_1);
  
  _bgl_add_to_rs0_reg0();
  _bgl_i_mul_sub_from_rs3_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg1();
  
  /*********************** direction -1 ************************/

  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s3);
  _bgl_load_reg1_up(sm->s2);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_sub_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_1);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_add_to_rs3_reg0();
  _bgl_i_mul_add_to_rs2_reg1();      
  
  /*********************** direction +2 ************************/
  
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg1_up(sp->s2);
  _bgl_load_reg0_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_sub_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_2);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs2_reg1();
  _bgl_add_to_rs3_reg0();
  

  /*********************** direction -2 ************************/
  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg1_up(sm->s2);
  _bgl_load_reg0_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_add_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_2);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs2_reg1();
  _bgl_sub_from_rs3_reg0();
  
  /*********************** direction +3 ************************/
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_sub_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_3);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg0();
  _bgl_i_mul_add_to_rs3_reg1();
  
  /*********************** direction -3 ************************/

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_add_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_3);
  
  _bgl_add_to_rs0_reg0();
  _bgl_store_rs0(rr->s0);
  _bgl_i_mul_add_to_rs2_reg0();
  _bgl_store_rs2(rr->s2);
  
  _bgl_add_to_rs1_reg1();
  _bgl_store_rs1(rr->s1);
  _bgl_i_mul_sub_from_rs3_reg1();
  _bgl_store_rs3(rr->s3);

}
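
local_H reads eight neighbour indices per site through _idx, in the order the idx++ sequence fixes (+0, -0, +1, -1, +2, -2, +3, -3), and eight consecutive gauge links starting at u (up/um advance by one per direction). A hypothetical caller sketch; the 8-links-per-site layout and the use of g_iup/g_idn are assumptions about the surrounding code, not taken from this source:

/* Hypothetical driver: fill the index array in the order local_H
 * consumes it and hand over eight consecutive links per site. */
void apply_local_H_everywhere(spinor * const out, spinor * const in,
                              su3 * const links /* assumed 8 per site */) {
  int idx[8];
  for (int ix = 0; ix < VOLUME; ix++) {
    for (int mu = 0; mu < 4; mu++) {
      idx[2 * mu]     = g_iup[ix][mu];  /* forward neighbour  */
      idx[2 * mu + 1] = g_idn[ix][mu];  /* backward neighbour */
    }
    local_H(out + ix, in, links + 8 * ix, idx);
  }
}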
Example #7
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  __alignx(16, l);
  __alignx(16, k);
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre32();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre32();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre32();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre32();
      ix++;

      /*********************** direction +2 ************************/
      _hop_y_p_pre32();

      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre32();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre32();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre32();
      ix++;

      /************************ end of loop ************************/
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    //_prefetch_halfspinor(phi32[0]);
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi32[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post32();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post32();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post32();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post32();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post32();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post32();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post32();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre();
      ix++;


      /*********************** direction +2 ************************/
      _hop_y_p_pre();
      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre();
      ix++;

      /************************ end of loop ************************/

    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    s = l;
    phi = NBPointer[2 + ieo];
    //_prefetch_halfspinor(phi[0]);
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
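
Structurally this Hopping_Matrix is a two-pass scheme: the _hop_*_pre macros project every source site onto half-spinors and scatter them through NBPointer, a single halo exchange (xchange_halffield, moving half the data of a full-spinor exchange) fills the boundaries, and the _hop_*_post macros gather the eight half-spinors addressed at each site, apply the links, and expand back to a full spinor. A toy 1-d analogue of the pre/exchange/post pattern (illustrative only, not tmLQCD code):

/* Toy 1-d pre/exchange/post stencil: pass 1 writes projected data into
 * a buffer with halo cells, a stubbed periodic "exchange" fills the
 * halo, pass 2 combines neighbours into the result. */
#include <stdlib.h>

void two_pass_stencil(double *out, const double *in, const int n) {
  double *buf = malloc((size_t)(n + 2) * sizeof *buf);  /* n + 2 halo  */
  for (int i = 0; i < n; i++) buf[i + 1] = 0.5 * in[i]; /* "pre" pass  */
  buf[0]     = buf[n];  /* stand-in for xchange_halffield(): periodic  */
  buf[n + 1] = buf[1];  /* halo exchange                               */
  for (int i = 0; i < n; i++)                           /* "post" pass */
    out[i] = buf[i] + buf[i + 2];
  free(buf);
}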
Example #8
void diff(spinor * const Q,spinor * const R,spinor * const S, const int N)
{
  int ix = 1;
  double *s ALIGN;
  double *sp ALIGN;
  double *r ALIGN;
  double *rp ALIGN;
  double *q ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, 
    x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, 
    y08, y09, y10, y11;
#pragma disjoint(*R, *S)

  __alignx(16, Q);
  __alignx(16, R);
  __alignx(16, S);
  r = (double*) R;
  s = (double*) S;
  q = (double*) Q;
  rp = r + 24;
  sp = s + 24;
  _prefetch_spinor(rp);
  _prefetch_spinor(sp);
  x00 = __lfpd(r);    
  x01 = __lfpd(r+2);  
  x02 = __lfpd(r+4);  
  x03 = __lfpd(r+6);  
  x04 = __lfpd(r+8);  
  x05 = __lfpd(r+10); 
  x06 = __lfpd(r+12); 
  x07 = __lfpd(r+14); 
  x08 = __lfpd(r+16); 
  x09 = __lfpd(r+18); 
  x10 = __lfpd(r+20); 
  x11 = __lfpd(r+22); 
  y00 = __lfpd(s);   
  y01 = __lfpd(s+2); 
  y02 = __lfpd(s+4); 
  y03 = __lfpd(s+6); 
  y04 = __lfpd(s+8); 
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));
  s = sp;
  r = rp;
  q+=24;
#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    rp+=24;
    sp+=24;
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
    x00 = __lfpd(r);    
    x01 = __lfpd(r+2);  
    x02 = __lfpd(r+4);  
    x03 = __lfpd(r+6);  
    x04 = __lfpd(r+8);  
    x05 = __lfpd(r+10); 
    x06 = __lfpd(r+12); 
    x07 = __lfpd(r+14); 
    x08 = __lfpd(r+16); 
    x09 = __lfpd(r+18); 
    x10 = __lfpd(r+20); 
    x11 = __lfpd(r+22); 
    y00 = __lfpd(s);   
    y01 = __lfpd(s+2); 
    y02 = __lfpd(s+4); 
    y03 = __lfpd(s+6); 
    y04 = __lfpd(s+8); 
    y05 = __lfpd(s+10);
    y06 = __lfpd(s+12);
    y07 = __lfpd(s+14);
    y08 = __lfpd(s+16);
    y09 = __lfpd(s+18);
    y10 = __lfpd(s+20);
    y11 = __lfpd(s+22);
    
    __stfpd(q, __fpsub(x00, y00));
    __stfpd(q+2, __fpsub(x01, y01));
    __stfpd(q+4, __fpsub(x02, y02));
    __stfpd(q+6, __fpsub(x03, y03));
    __stfpd(q+8, __fpsub(x04, y04));
    __stfpd(q+10, __fpsub(x05, y05));
    __stfpd(q+12, __fpsub(x06, y06));
    __stfpd(q+14, __fpsub(x07, y07));
    __stfpd(q+16, __fpsub(x08, y08));
    __stfpd(q+18, __fpsub(x09, y09));
    __stfpd(q+20, __fpsub(x10, y10));
    __stfpd(q+22, __fpsub(x11, y11));
    s = sp;
    r = rp;
    q+=24;
  }
  x00 = __lfpd(r);    
  x01 = __lfpd(r+2);  
  x02 = __lfpd(r+4);  
  x03 = __lfpd(r+6);  
  x04 = __lfpd(r+8);  
  x05 = __lfpd(r+10); 
  x06 = __lfpd(r+12); 
  x07 = __lfpd(r+14); 
  x08 = __lfpd(r+16); 
  x09 = __lfpd(r+18); 
  x10 = __lfpd(r+20); 
  x11 = __lfpd(r+22); 
  y00 = __lfpd(s);   
  y01 = __lfpd(s+2); 
  y02 = __lfpd(s+4); 
  y03 = __lfpd(s+6); 
  y04 = __lfpd(s+8); 
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  return;
}
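
Each iteration of diff moves one spinor (24 doubles) through paired loads (__lfpd) and stores (__stfpd) while prefetching one spinor ahead; the first and last iterations are peeled so the prefetch never runs past the ends of the arrays. Stripped of the intrinsics, the routine is plain elementwise subtraction:

/* Plain-C equivalent of the intrinsic version above:
 * Q = R - S over N spinors of 24 doubles each. */
void diff_plain(spinor * const Q, spinor * const R,
                spinor * const S, const int N) {
  double *q = (double *) Q;
  double *r = (double *) R;
  double *s = (double *) S;
  for (int i = 0; i < 24 * N; i++)
    q[i] = r[i] - s[i];
}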
Example #9
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix, i;
  su3 * restrict U ALIGN;
  static spinor rs;
  spinor * restrict s ALIGN;
  halfspinor ** phi ALIGN;
#if defined OPTERON
  const int predist=2;
#else
  const int predist=1;
#endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge();
  }
#endif
  /* We will run through the source vector now */
  /* instead of the solution vector            */
  s = k;
  _prefetch_spinor(s);

  if(ieo == 0) {
    U = g_gauge_field_copy[0][0];
  }
  else {
    U = g_gauge_field_copy[1][0];
  }
  phi = NBPointer[ieo];

  _prefetch_su3(U);
  /**************** loop over all lattice sites ******************/
  ix=0;
  for(i = 0; i < (VOLUME)/2; i++){

    /*********************** direction +0 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_add();
      
    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka0);
    _sse_store_nt_up((*phi[ix]).s1);
    U++;
    ix++;
    /*********************** direction -0 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +1 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    /*next not needed?*/
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka1);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -1 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +2 ************************/
    _prefetch_su3(U+predist);

    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka2);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;
    /*********************** direction -2 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s3);
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s2);
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;

    /*********************** direction +3 ************************/
    _prefetch_su3(U+predist);
    _prefetch_spinor(s+1);

    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_add();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_sub();

    _sse_su3_multiply((*U));
    _sse_vector_cmplx_mul(ka3);
    _sse_store_nt_up((*phi[ix]).s1);
    ix++;
    U++;

    /*********************** direction -3 ************************/
    _sse_load((*s).s0);
    _sse_load_up((*s).s2);
    _sse_vector_i_mul();
    _sse_vector_sub();
    _sse_store_nt((*phi[ix]).s0);

    _sse_load((*s).s1);
    _sse_load_up((*s).s3);
    _sse_vector_i_mul();
    _sse_vector_add();
    _sse_store_nt((*phi[ix]).s1);
    ix++;
    s++;
  }

#    if (defined MPI && !defined _NO_COMM)
  xchange_halffield(); 
#    endif
  s = l;
  phi = NBPointer[2 + ieo];
  if(ieo == 0) {
    U = g_gauge_field_copy[1][0];
  }
  else {
    U = g_gauge_field_copy[0][0];
  }
  _prefetch_su3(U);
  
  /* Now we sum up and expand to a full spinor */
  ix = 0;
  for(i = 0; i < (VOLUME)/2; i++){
    /*********************** direction +0 ************************/
    _vector_assign(rs.s0, (*phi[ix]).s0);
    _vector_assign(rs.s2, (*phi[ix]).s0);
    _vector_assign(rs.s1, (*phi[ix]).s1);
    _vector_assign(rs.s3, (*phi[ix]).s1);
    ix++;

    /*********************** direction -0 ************************/
    _prefetch_su3(U+predist);
      
    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load((*phi[ix]).s1);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka0);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    ix++;
    U++;
    /*********************** direction +1 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s3); 

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);       
    ix++;

    /*********************** direction -1 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s2);
    ix++;
    U++;

    /*********************** direction +2 ************************/
    _sse_load_up((*phi[ix]).s0);
    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_add();
    _sse_store(rs.s3);

    _sse_load_up((*phi[ix]).s1);
    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_sub();
    _sse_store(rs.s2);      
    ix++;

    /*********************** direction -2 ************************/

    _prefetch_su3(U+predist);

    _sse_load((*phi[ix]).s0);
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s3);
    _sse_vector_sub();
    _sse_store(rs.s3);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka2);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s2);
    _sse_vector_add();
    _sse_store(rs.s2);      
    ix++;
    U++;
    /*********************** direction +3 ************************/
    _sse_load_up((*phi[ix]).s0);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store(rs.s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store(rs.s2);

    _sse_load_up((*phi[ix]).s1);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store(rs.s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store(rs.s3);

    ix++;
    /*********************** direction -3 ************************/

    _prefetch_su3(U+predist); 
    _prefetch_spinor(s+1);

    _sse_load((*phi[ix]).s0);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rs.s0);
    _sse_vector_add();
    _sse_store_nt((*s).s0);

    _sse_load(rs.s2);
    _sse_vector_i_mul();      
    _sse_vector_add();
    _sse_store_nt((*s).s2);

    _sse_load((*phi[ix]).s1);
      
    _sse_su3_inverse_multiply((*U));
    _sse_vector_cmplxcg_mul(ka3);

    _sse_load(rs.s1);
    _sse_vector_add();
    _sse_store_nt((*s).s1);

    _sse_load(rs.s3);
    _sse_vector_i_mul();      
    _sse_vector_sub();
    _sse_store_nt((*s).s3);
    ix++;
    U++;
    s++;
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
Example #10
double square_norm(spinor * const P, const int N, const int parallel) {
  int ix=0;
  double res, res2;
  double *s ALIGN;
  double *sp ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, 
    x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, 
    y08, y09, y10, y11;

  __alignx(16, P);
  s = (double*)P;
  sp = s+24;
  _prefetch_spinor(sp);
  x00 = __lfpd(s);
  x01 = __lfpd(s+2);
  x02 = __lfpd(s+4);
  x03 = __lfpd(s+6);
  x04 = __lfpd(s+8);
  x05 = __lfpd(s+10);
  x06 = __lfpd(s+12);
  x07 = __lfpd(s+14);
  x08 = __lfpd(s+16);
  x09 = __lfpd(s+18);
  x10 = __lfpd(s+20);
  x11 = __lfpd(s+22);
  
  y00 = __fpmul(x00, x00);
  y01 = __fpmul(x01, x01);
  y02 = __fpmul(x02, x02);
  y03 = __fpmul(x03, x03);
  y04 = __fpmul(x04, x04);
  y05 = __fpmul(x05, x05);
  y06 = __fpmul(x06, x06);
  y07 = __fpmul(x07, x07);
  y08 = __fpmul(x08, x08);
  y09 = __fpmul(x09, x09);
  y10 = __fpmul(x10, x10);
  y11 = __fpmul(x11, x11);
  s = sp;


#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    sp+=24;
    _prefetch_spinor(sp);
    x00 = __lfpd(s);   
    x01 = __lfpd(s+2); 
    x02 = __lfpd(s+4); 
    x03 = __lfpd(s+6); 
    x04 = __lfpd(s+8); 
    x05 = __lfpd(s+10);
    x06 = __lfpd(s+12);
    x07 = __lfpd(s+14);
    x08 = __lfpd(s+16);
    x09 = __lfpd(s+18);
    x10 = __lfpd(s+20);
    x11 = __lfpd(s+22);
    y00 = __fpmadd(y00, x00, x00); 
    y01 = __fpmadd(y01, x01, x01); 
    y02 = __fpmadd(y02, x02, x02); 
    y03 = __fpmadd(y03, x03, x03); 
    y04 = __fpmadd(y04, x04, x04); 
    y05 = __fpmadd(y05, x05, x05); 
    y06 = __fpmadd(y06, x06, x06); 
    y07 = __fpmadd(y07, x07, x07); 
    y08 = __fpmadd(y08, x08, x08); 
    y09 = __fpmadd(y09, x09, x09); 
    y10 = __fpmadd(y10, x10, x10); 
    y11 = __fpmadd(y11, x11, x11); 
    s=sp;
  }
  x00 = __lfpd(s);   
  x01 = __lfpd(s+2); 
  x02 = __lfpd(s+4); 
  x03 = __lfpd(s+6); 
  x04 = __lfpd(s+8); 
  x05 = __lfpd(s+10);
  x06 = __lfpd(s+12);
  x07 = __lfpd(s+14);
  x08 = __lfpd(s+16);
  x09 = __lfpd(s+18);
  x10 = __lfpd(s+20);
  x11 = __lfpd(s+22);
  y00 = __fpmadd(y00, x00, x00); 
  y01 = __fpmadd(y01, x01, x01); 
  y02 = __fpmadd(y02, x02, x02); 
  y03 = __fpmadd(y03, x03, x03); 
  y04 = __fpmadd(y04, x04, x04); 
  y05 = __fpmadd(y05, x05, x05); 
  y06 = __fpmadd(y06, x06, x06); 
  y07 = __fpmadd(y07, x07, x07); 
  y08 = __fpmadd(y08, x08, x08); 
  y09 = __fpmadd(y09, x09, x09); 
  y10 = __fpmadd(y10, x10, x10); 
  y11 = __fpmadd(y11, x11, x11); 
  
  y00 = __fpadd(y00, y01);
  y02 = __fpadd(y02, y03);
  y04 = __fpadd(y04, y05);
  y06 = __fpadd(y06, y07);
  y08 = __fpadd(y08, y09);
  y10 = __fpadd(y10, y11);
  y00 = __fpadd(y00, y02);
  y04 = __fpadd(y04, y06);
  y08 = __fpadd(y08, y10);
  y00 = __fpadd(y00, y04);
  y00 = __fpadd(y00, y08);
  res = __creal(y00)+__cimag(y00);
#  ifdef TM_USE_MPI
  if(parallel) {
    MPI_Allreduce(&res, &res2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return res2;
  }
#  endif
  return res;
}