Example #1
void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) {
#ifdef TM_USE_OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5, k;
  float *s, *r;
  float ALIGN32 _c;
  _c = c;
  __prefetch_by_load(S);
  __prefetch_by_load(R);

  k = vec_splats((double)_c);
  __alignx(16, s);
  __alignx(16, r);
  __alignx(16, S);
  __alignx(16, R);

#ifdef TM_USE_OMP
#pragma omp for
#else
#pragma unroll(2)
#endif
  for(int i = 0; i < N; i++) {
    s=(float*)((spinor32 *) S + i);
    r=(float*)((spinor32 *) R + i);
    __prefetch_by_load(S + i + 1);
    __prefetch_by_stream(1, R + i + 1);
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);
    z0 = vec_madd(k, y0, x0);
    z1 = vec_madd(k, y1, x1);
    z2 = vec_madd(k, y2, x2);
    z3 = vec_madd(k, y3, x3);
    z4 = vec_madd(k, y4, x4);
    z5 = vec_madd(k, y5, x5);
    vec_st(z0, 0, r);
    vec_st(z1, 0, r+4);
    vec_st(z2, 0, r+8);
    vec_st(z3, 0, r+12);
    vec_st(z4, 0, r+16);
    vec_st(z5, 0, r+20);
  }
#ifdef TM_USE_OMP
  } /* OpenMP closing brace */
#endif
  return;
}
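
For reference, the QPX kernel above computes R <- R + c*S over N spinors. A portable scalar sketch, taking the fields as raw float arrays and assuming tmLQCD's spinor32 layout of 24 contiguous floats per site (4 spin components x 3 colours x re/im, exactly the six vector4double loads per iteration):

/* Hedged reference implementation: R <- R + c*S, 24 floats per spinor32. */
void assign_add_mul_r_32_ref(float * const r, const float * const s,
                             const float c, const int N) {
  for (int i = 0; i < 24 * N; i++)
    r[i] += c * s[i];
}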
Example #2
double scalar_prod_r(spinor * const S,spinor * const R, const int N, const int parallel){
  int ix;
  static double ks,kc,ds,tr,ts,tt;
  spinor *s,*r;
  ks=0.0;
  kc=0.0;

#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif
  
  for (ix=0;ix<N;ix++){
    s = (spinor *) S + ix;
    r = (spinor *) R + ix;
    
    ds=(*r).s0.c0.re*(*s).s0.c0.re + (*r).s0.c0.im*(*s).s0.c0.im + 
       (*r).s0.c1.re*(*s).s0.c1.re + (*r).s0.c1.im*(*s).s0.c1.im + 
       (*r).s0.c2.re*(*s).s0.c2.re + (*r).s0.c2.im*(*s).s0.c2.im + 
       (*r).s1.c0.re*(*s).s1.c0.re + (*r).s1.c0.im*(*s).s1.c0.im + 
       (*r).s1.c1.re*(*s).s1.c1.re + (*r).s1.c1.im*(*s).s1.c1.im + 
       (*r).s1.c2.re*(*s).s1.c2.re + (*r).s1.c2.im*(*s).s1.c2.im + 
       (*r).s2.c0.re*(*s).s2.c0.re + (*r).s2.c0.im*(*s).s2.c0.im + 
       (*r).s2.c1.re*(*s).s2.c1.re + (*r).s2.c1.im*(*s).s2.c1.im + 
       (*r).s2.c2.re*(*s).s2.c2.re + (*r).s2.c2.im*(*s).s2.c2.im + 
       (*r).s3.c0.re*(*s).s3.c0.re + (*r).s3.c0.im*(*s).s3.c0.im + 
       (*r).s3.c1.re*(*s).s3.c1.re + (*r).s3.c1.im*(*s).s3.c1.im + 
       (*r).s3.c2.re*(*s).s3.c2.re + (*r).s3.c2.im*(*s).s3.c2.im;
    
    tr = ds + kc;
    ts = tr + ks;
    tt = ts-ks;
    ks = ts;
    kc = tr-tt;
  }
  kc = ks + kc;

#if defined MPI
  if(parallel) {
    MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return ks;
  }
#endif

  return kc;

}
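
The tr/ts/tt/ks/kc sequence in the loop is Kahan (compensated) summation: ks carries the running sum, kc the rounding error lost by the previous addition. A standalone illustration of the same update rule:

#include <stdio.h>

int main(void) {
  double naive = 0.0;
  double ks = 0.0, kc = 0.0;   /* running sum and compensation */
  for (int i = 0; i < 10000000; i++) {
    naive += 0.1;              /* plain summation accumulates error */
    double tr = 0.1 + kc;      /* ds + kc: re-inject the lost part */
    double ts = tr + ks;       /* tentative new sum */
    double tt = ts - ks;       /* what was actually added to ks */
    ks = ts;
    kc = tr - tt;              /* what got rounded away this step */
  }
  /* on an IEEE-754 double build the naive sum drifts visibly away
     from 10^6, while ks + kc stays at the correctly rounded value */
  printf("naive: %.10f\nkahan: %.10f\n", naive, ks + kc);
  return 0;
}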
Example #3
double scalar_prod_r(spinor * const S, spinor * const R, const int N, const int parallel)
{
  
  static double ks,kc,ds,tr,ts,tt;
  spinor *s,*r;
  
  ks=0.0;
  kc=0.0;
  
#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif

  for (int ix = 0; ix < N; ++ix)
  {
    s=(spinor *) S + ix;
    r=(spinor *) R + ix;
    
    ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
         r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
         r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) +
         r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
    
    tr=ds+kc;
    ts=tr+ks;
    tt=ts-ks;
    ks=ts;
    kc=tr-tt;
  }
  kc=ks+kc;

#if defined MPI
  if(parallel)
  {
    double buffer = kc;
    MPI_Allreduce(&buffer, &kc, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  }
#endif

  return kc;

}
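
Note that ds is a plain double while the right-hand side is a _Complex double expression; C99 defines this conversion as discarding the imaginary part, so the assignment is equivalent to ds = creal(...). A two-line check:

#include <complex.h>
#include <stdio.h>

int main(void) {
  double _Complex z = 3.0 + 4.0 * I;
  double d = z;                     /* C99 drops the imaginary part */
  printf("%f %f\n", d, creal(z));   /* both print 3.000000 */
  return 0;
}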
Example #4
/* 3. */
void xchange_halffield() {
#  ifdef MPI

  MPI_Status status[16];
#    ifdef PARALLELT
  int reqcount = 4;
#    elif defined PARALLELXT
  int reqcount = 8;
#    elif defined PARALLELXYT
  int reqcount = 12;
#    elif defined PARALLELXYZT
  int x0=0, x1=0, x2=0, ix=0;
  int reqcount = 16;
#    endif
#    if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
#    endif
  MPI_Startall(reqcount, prequests);

  MPI_Waitall(reqcount, prequests, status); 
#  endif /* MPI */
  return;
}
Example #5
void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
  int ix;
  su3 * restrict ALIGN U;
  spinor * restrict ALIGN s;
  halfspinor * restrict * phi ALIGN;
  halfspinor32 * restrict * phi32 ALIGN;
  /* We have 32 registers available */
  _declare_hregs();

#ifdef _KOJAK_INST
#pragma pomp inst begin(hoppingmatrix)
#endif
#pragma disjoint(*s, *U)

#ifdef _GAUGE_COPY
  if(g_update_gauge_copy) {
    update_backward_gauge(g_gauge_field);
  }
#endif

  __alignx(16, l);
  __alignx(16, k);
  if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) {
    __alignx(16, HalfSpinor32);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi32 = NBPointer32[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre32();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre32();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre32();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre32();
      ix++;

      /*********************** direction +2 ************************/
      _hop_y_p_pre32();

      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre32();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre32();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre32();
      ix++;

      /************************ end of loop ************************/
    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield32(); 
#    endif
    s = l;
    phi32 = NBPointer32[2 + ieo];
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    //_prefetch_halfspinor(phi32[0]);
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi32[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post32();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post32();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post32();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post32();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post32();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post32();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post32();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post32();
      U++;
      ix++;
      s++;
    }
  }
  else {
    __alignx(16, HalfSpinor);
    /* We will run through the source vector now */
    /* instead of the solution vector            */
    s = k;
    _prefetch_spinor(s); 

    /* s contains the source vector */

    if(ieo == 0) {
      U = g_gauge_field_copy[0][0];
    }
    else {
      U = g_gauge_field_copy[1][0];
    }
    phi = NBPointer[ieo];

    _prefetch_su3(U);
    /**************** loop over all lattice sites ******************/
    ix=0;
    for(int i = 0; i < (VOLUME)/2; i++){
      /*********************** direction +0 ************************/
      _hop_t_p_pre();
      s++; 
      U++;
      ix++;

      /*********************** direction -0 ************************/
      _hop_t_m_pre();
      ix++;

      /*********************** direction +1 ************************/
      _hop_x_p_pre();
      ix++;
      U++;

      /*********************** direction -1 ************************/
      _hop_x_m_pre();
      ix++;


      /*********************** direction +2 ************************/
      _hop_y_p_pre();
      ix++;
      U++;

      /*********************** direction -2 ************************/
      _hop_y_m_pre();
      ix++;

      /*********************** direction +3 ************************/
      _hop_z_p_pre();
      ix++;
      U++;

      /*********************** direction -3 ************************/
      _hop_z_m_pre();
      ix++;

      /************************ end of loop ************************/

    }

#    if (defined TM_USE_MPI && !defined _NO_COMM)
    xchange_halffield(); 
#    endif
    s = l;
    phi = NBPointer[2 + ieo];
    //_prefetch_halfspinor(phi[0]);
    if(ieo == 0) {
      U = g_gauge_field_copy[1][0];
    }
    else {
      U = g_gauge_field_copy[0][0];
    }
    _prefetch_su3(U);
  
    /* Now we sum up and expand to a full spinor */
    ix = 0;
    /*   _prefetch_spinor_for_store(s); */
    for(int i = 0; i < (VOLUME)/2; i++){
      /* This causes a lot of trouble, do we understand this? */
      /*     _prefetch_spinor_for_store(s); */
      //_prefetch_halfspinor(phi[ix+1]);
      /*********************** direction +0 ************************/
      _hop_t_p_post();
      ix++;
      /*********************** direction -0 ************************/
      _hop_t_m_post();
      U++;
      ix++;
      /*********************** direction +1 ************************/
      _hop_x_p_post();
      ix++;
      /*********************** direction -1 ************************/
      _hop_x_m_post();
      U++;
      ix++;
      /*********************** direction +2 ************************/
      _hop_y_p_post();
      ix++;
      /*********************** direction -2 ************************/
      _hop_y_m_post();
      U++;
      ix++;
      /*********************** direction +3 ************************/
      _hop_z_p_post();
      ix++;
      /*********************** direction -3 ************************/
      _hop_z_m_post();
      U++;
      ix++;
      s++;
    }
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(hoppingmatrix)
#endif
}
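
Stripped of the BG/L register macros, Hopping_Matrix is a gather/exchange/scatter pipeline: project every site onto eight half-spinors, exchange the boundary half-spinors between ranks, then reconstruct full spinors. A compilable schematic with hypothetical stub names (project_dir/reconstruct_dir stand in for the _hop_*_pre/_hop_*_post macro pairs; the struct layouts are assumptions):

typedef struct { double c[24]; } spinor_sk;      /* 4 spin x 3 colour x re/im */
typedef struct { double c[12]; } halfspinor_sk;  /* projected: half the components */

static void project_dir(halfspinor_sk *phi, const spinor_sk *src, int dir) {
  /* phi = P(dir) * src, times the gauge link for forward directions */
  (void)phi; (void)src; (void)dir;
}
static void reconstruct_dir(spinor_sk *dst, const halfspinor_sk *phi, int dir) {
  /* dst += R(dir) * phi, times the adjoint link where needed */
  (void)dst; (void)phi; (void)dir;
}

void hopping_matrix_sketch(int volume_half, spinor_sk *l, const spinor_sk *k,
                           halfspinor_sk *phi /* 8 per site */) {
  /* phase 1: gather -- the first VOLUME/2 loop above */
  for (int i = 0; i < volume_half; i++)
    for (int dir = 0; dir < 8; dir++)
      project_dir(&phi[8 * i + dir], &k[i], dir);

  /* boundary half-spinors travel here: xchange_halffield() above */

  /* phase 2: scatter -- the second VOLUME/2 loop above */
  for (int i = 0; i < volume_half; i++)
    for (int dir = 0; dir < 8; dir++)
      reconstruct_dir(&l[i], &phi[8 * i + dir], dir);
}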
Example #6
/* 2. */
void init_xchange_halffield() {

#  ifdef MPI

#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int x0=0, x1=0, x2=0, ix=0;
  int reqcount = 16;
#  endif
#  if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
#  endif

  /* send the data to the neighbour on the right in t direction */
  /* receive the data from the neighbour on the left in t direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME), LX*LY*LZ*12/2, MPI_DOUBLE, 
		g_nb_t_up, 81, g_cart_grid, &prequests[0]);
  
  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, 
		g_nb_t_dn, 81, g_cart_grid, &prequests[1]);

  /* send the data to the neighbour on the left in t direction */
  /* receive the data from the neighbour on the right in t direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, 
		g_nb_t_dn, 82, g_cart_grid, &prequests[2]);

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2), LX*LY*LZ*12/2, MPI_DOUBLE, 
		g_nb_t_up, 82, g_cart_grid, &prequests[3]);

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)

  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE, 
		g_nb_x_up, 91, g_cart_grid, &prequests[4]);

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
		g_nb_x_dn, 91, g_cart_grid, &prequests[5]);

  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
		g_nb_x_dn, 92, g_cart_grid, &prequests[6]);

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE,
		g_nb_x_up, 92, g_cart_grid, &prequests[7]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, 
		g_nb_y_up, 101, g_cart_grid, &prequests[8]);

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, 
		g_nb_y_dn, 101, g_cart_grid, &prequests[9]);

  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, 
		g_nb_y_dn, 102, g_cart_grid, &prequests[10]);

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, 
		g_nb_y_up, 102, g_cart_grid, &prequests[11]);
#    endif
    
#    if (defined PARALLELXYZT)
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ), 
		T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &prequests[12]); 

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), 
		T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &prequests[13]); 

  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Send_init((void*)(HalfSpinor + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), 
		12*T*LX*LY/2, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &prequests[14]); 

  MPI_Recv_init((void*)(HalfSpinor + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), 
		T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &prequests[15]); 
#    endif
#  endif /* MPI */
  return;
}
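
init_xchange_halffield only creates the persistent requests; xchange_halffield (example #4) then fires the same requests every iteration with MPI_Startall/MPI_Waitall. A minimal self-contained sketch of that lifecycle, for exactly two ranks exchanging one double:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
  int rank;
  double sendbuf = 0.0, recvbuf = 0.0;
  MPI_Request req[2];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  int other = 1 - rank;              /* assumes exactly 2 ranks */

  /* set up once, like init_xchange_halffield() */
  MPI_Send_init(&sendbuf, 1, MPI_DOUBLE, other, 81, MPI_COMM_WORLD, &req[0]);
  MPI_Recv_init(&recvbuf, 1, MPI_DOUBLE, other, 81, MPI_COMM_WORLD, &req[1]);

  /* restart and wait every iteration, like xchange_halffield() */
  for (int iter = 0; iter < 3; iter++) {
    sendbuf = rank + 10.0 * iter;
    MPI_Startall(2, req);
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
    printf("rank %d iter %d received %f\n", rank, iter, recvbuf);
  }

  MPI_Request_free(&req[0]);
  MPI_Request_free(&req[1]);
  MPI_Finalize();
  return 0;
}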
Example #7
void xchange_lexicfield32(spinor32 * const l) {

#ifdef TM_USE_MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int reqcount = 16;
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange_lexicfield32)
#endif
#  if (defined BGL && defined XLC)
  __alignx(16, l);
#  endif

#  ifdef TM_USE_MPI


  /* send the data to the neighbour on the left */
  /* receive the data from the neighbour on the right */
  MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(l+VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[1]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid,  &requests[4]);
  MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[5]);

#    endif
  
#    if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[9]);
#    endif
  
#    if (defined PARALLELXYZT)
  
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[13]); 
#    endif
  /* send the data to the neighbour on the right */
  /* receive the data from the neighbour on the left */
  MPI_Isend((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[3]);
  
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, g_cart_grid, &requests[7]);
#    endif
  
#    if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+(LY-1)*LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]);
#    endif
  
#    if defined PARALLELXYZT
  
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(l+LZ-1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[15]); 
#    endif
  
  MPI_Waitall(reqcount, requests, status);

#  endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange_lexicfield32)
#endif
  return;
}
Example #8
void xchange_lexicfield(spinor * const l) {

#ifdef TM_USE_MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
  int ireq;
#  if ( defined PARALLELT || defined PARALLELX )
  int reqcount = 4;
#  elif ( defined PARALLELXT || defined PARALLELXY )
  int reqcount = 8;
#  elif ( defined PARALLELXYT || defined PARALLELXYZ )
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int ix=0;
  int reqcount = 16;
#  endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange_lexicfield)
#endif
#  if (defined BGL && defined XLC)
  __alignx(16, l);
#  endif

#  ifdef TM_USE_MPI


  ireq=0;

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the left */
  /* receive the data from the neighbour on the right */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#    endif


#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid,  &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont, g_nb_x_up, 5091, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#    endif
  
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont, g_nb_y_up, 5101, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#    endif
  
#    if (defined PARALLELXYZT || defined PARALLELXYZ )  
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath, g_nb_z_dn, 5503, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont, g_nb_z_up, 5503, g_cart_grid, &requests[ireq+1]); 
  ireq=ireq+4;
#    endif

  ireq=2;

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the right */
  /* receive the data from the neighbour on the left */
  MPI_Isend((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5082, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#endif
  
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#    endif
  
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, g_cart_grid, &requests[ireq+1]);
  ireq=ireq+4;
#    endif
  
#    if ( defined PARALLELXYZT || defined PARALLELXYZ )  
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath, g_nb_z_up, 5504, g_cart_grid, &requests[ireq]);
  MPI_Irecv((void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont, g_nb_z_dn, 5504, g_cart_grid, &requests[ireq+1]); 
#    endif
  
  MPI_Waitall(reqcount, requests, status);

#  endif
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange_lexicfield)
#endif
  return;
}
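
Both xchange_lexicfield variants follow the same pattern: post all nonblocking sends and receives into ghost regions, then block once in MPI_Waitall. A minimal 1-D analogue with periodic neighbours (the derived datatypes and ghost offsets of the real code are replaced by a single double per face):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
  enum { n = 4 };
  double buf[n + 2];                  /* buf[1..n] interior, buf[0] and buf[n+1] ghosts */
  int rank, size;
  MPI_Request requests[4];

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  int up = (rank + 1) % size;         /* periodic torus neighbours,  */
  int dn = (rank - 1 + size) % size;  /* like g_nb_t_up / g_nb_t_dn  */

  for (int i = 1; i <= n; i++) buf[i] = rank;

  /* post everything first, wait once at the end -- same shape as above */
  MPI_Isend(&buf[1],     1, MPI_DOUBLE, dn, 5081, MPI_COMM_WORLD, &requests[0]);
  MPI_Irecv(&buf[n + 1], 1, MPI_DOUBLE, up, 5081, MPI_COMM_WORLD, &requests[1]);
  MPI_Isend(&buf[n],     1, MPI_DOUBLE, up, 5082, MPI_COMM_WORLD, &requests[2]);
  MPI_Irecv(&buf[0],     1, MPI_DOUBLE, dn, 5082, MPI_COMM_WORLD, &requests[3]);
  MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);

  printf("rank %d ghosts: %f %f\n", rank, buf[0], buf[n + 1]);
  MPI_Finalize();
  return 0;
}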
Example #9
void deriv_Sb_D_psi(spinor * const l, spinor * const k) {
/* const int l, const int k){ */
  int ix,iy;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
/*   su3adj * restrict ddd; */
/*   static su3adj der; */
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
/*   spinor * restrict r ALIGN; */
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#ifdef XLC
#pragma disjoint(*sp, *sm, *up, *um)
#endif

#ifdef BGL
  __alignx(16, l);
  __alignx(16, k);
#endif

  /* for parallelization */
#ifdef MPI
  xchange_lexicfield(k);
  xchange_lexicfield(l);
#endif
  /************** loop over all lattice sites ****************/

  for(ix = 0; ix < (VOLUME); ix++){
    rr = (*(l + ix));
    /*     rr=g_spinor_field[l][icx-ioff]; */

    /*multiply the left vector with gamma5*/
    _vector_minus_assign(rr.s2, rr.s2);
    _vector_minus_assign(rr.s3, rr.s3);

    /*********************** direction +0 ********************/

    iy=g_iup[ix][0];

    sp = k + iy;
/*     sp=&g_spinor_field[k][icy]; */
    up=&g_gauge_field[ix][0];
      
    _vector_add(psia,(*sp).s0,(*sp).s2);
    _vector_add(psib,(*sp).s1,(*sp).s3);
      
    _vector_add(phia,rr.s0,rr.s2);
    _vector_add(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
/*     _vector_tensor_vector(v1,phia,psia); */
/*     _vector_tensor_vector(v2,phib,psib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_add_assign(df0[ix][0], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[ix][0]; */
/*     _add_su3adj(*ddd,der); */

    /************** direction -0 ****************************/

    iy=g_idn[ix][0];

    sm = k + iy;
/*     sm=&g_spinor_field[k][icy]; */
    um=&g_gauge_field[iy][0];
      
    _vector_sub(psia,(*sm).s0,(*sm).s2);
    _vector_sub(psib,(*sm).s1,(*sm).s3);

    _vector_sub(phia,rr.s0,rr.s2);
    _vector_sub(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
/*     _vector_tensor_vector(v1,psia,phia); */
/*     _vector_tensor_vector(v2,psib,phib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_add_assign(df0[iy][0], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[iy][0]; */
/*     _add_su3adj(*ddd,der); */

    /*************** direction +1 **************************/

    iy=g_iup[ix][1];

    sp = k + iy;
    /*     sp=&g_spinor_field[k][icy]; */
    up=&g_gauge_field[ix][1];      

    _vector_i_add(psia,(*sp).s0,(*sp).s3);
    _vector_i_add(psib,(*sp).s1,(*sp).s2);

    _vector_i_add(phia,rr.s0,rr.s3);
    _vector_i_add(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
/*     _vector_tensor_vector(v1,phia,psia); */
/*     _vector_tensor_vector(v2,phib,psib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_add_assign(df0[ix][1], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[ix][1]; */
/*     _add_su3adj(*ddd,der); */

    /**************** direction -1 *************************/

    iy=g_idn[ix][1];

    sm = k + iy;
    /*     sm=&g_spinor_field[k][icy]; */
    um=&g_gauge_field[iy][1];
      
    _vector_i_sub(psia,(*sm).s0,(*sm).s3);
    _vector_i_sub(psib,(*sm).s1,(*sm).s2);

    _vector_i_sub(phia,rr.s0,rr.s3);
    _vector_i_sub(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
/*     _vector_tensor_vector(v1,psia,phia); */
/*     _vector_tensor_vector(v2,psib,phib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_add_assign(df0[iy][1], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[iy][1]; */
/*     _add_su3adj(*ddd,der); */

    /*************** direction +2 **************************/

    iy=g_iup[ix][2];

    sp = k + iy;
    /*     sp=&g_spinor_field[k][icy]; */
    up=&g_gauge_field[ix][2];
      
    _vector_add(psia,(*sp).s0,(*sp).s3);
    _vector_sub(psib,(*sp).s1,(*sp).s2);
      
    _vector_add(phia,rr.s0,rr.s3);
    _vector_sub(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
/*     _vector_tensor_vector(v1,phia,psia); */
/*     _vector_tensor_vector(v2,phib,psib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_add_assign(df0[ix][2], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[ix][2]; */
/*     _add_su3adj(*ddd,der); */

    /***************** direction -2 ************************/

    iy=g_idn[ix][2];

    sm = k + iy;
    /*     sm=&g_spinor_field[k][icy]; */
    um=&g_gauge_field[iy][2];
      
    _vector_sub(psia,(*sm).s0,(*sm).s3);
    _vector_add(psib,(*sm).s1,(*sm).s2);

    _vector_sub(phia,rr.s0,rr.s3);
    _vector_add(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
/*     _vector_tensor_vector(v1,psia,phia); */
/*     _vector_tensor_vector(v2,psib,phib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_add_assign(df0[iy][2], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[iy][2]; */
/*     _add_su3adj(*ddd,der); */

    /****************** direction +3 ***********************/

    iy=g_iup[ix][3];

    sp = k + iy;
    /*     sp=&g_spinor_field[k][icy]; */
    up=&g_gauge_field[ix][3];
      
    _vector_i_add(psia,(*sp).s0,(*sp).s2);
    _vector_i_sub(psib,(*sp).s1,(*sp).s3);

    _vector_i_add(phia,rr.s0,rr.s2);
    _vector_i_sub(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
/*     _vector_tensor_vector(v1,phia,psia); */
/*     _vector_tensor_vector(v2,phib,psib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_add_assign(df0[ix][3], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[ix][3]; */
/*     _add_su3adj(*ddd,der); */

    /***************** direction -3 ************************/

    iy=g_idn[ix][3];

    sm = k + iy;
    /*     sm=&g_spinor_field[k][icy]; */
    um=&g_gauge_field[iy][3];
      
    _vector_i_sub(psia,(*sm).s0,(*sm).s2);
    _vector_i_add(psib,(*sm).s1,(*sm).s3);

    _vector_i_sub(phia,rr.s0,rr.s2);
    _vector_i_add(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
/*     _vector_tensor_vector(v1,psia,phia); */
/*     _vector_tensor_vector(v2,psib,phib); */
/*     _su3_plus_su3(v1,v1,v2); */
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_add_assign(df0[iy][3], v1);
/*     _trace_lambda(der,v1); */
/*     ddd=&df0[iy][3]; */
/*     _add_su3adj(*ddd,der); */
     
    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
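
The commented-out expansion shows what _vector_tensor_vector_add does: v1 is the sum of two colour outer products, v1 = phia (x) psia^dagger + phib (x) psib^dagger. A plain C99 sketch under that reading (the 3-vector and 3x3-matrix types are assumptions standing in for su3_vector and su3):

#include <complex.h>

typedef struct { double _Complex c[3]; } cvec3;
typedef struct { double _Complex m[3][3]; } cmat3;

/* v[i][j] = a_i conj(b_j) + c_i conj(d_j), i.e. a (x) b^dagger + c (x) d^dagger */
void vector_tensor_vector_add(cmat3 *v, const cvec3 *a, const cvec3 *b,
                              const cvec3 *c, const cvec3 *d) {
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++)
      v->m[i][j] = a->c[i] * conj(b->c[j]) + c->c[i] * conj(d->c[j]);
}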
Example #10
File: diff.c Project: LorenzoRiggio/tmLQCD
void diff(spinor * const Q,spinor * const R,spinor * const S, const int N)
{
  int ix = 1;
  double *s ALIGN;
  double *sp ALIGN;
  double *r ALIGN;
  double *rp ALIGN;
  double *q ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, 
    x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, 
    y08, y09, y10, y11;
#pragma disjoint(*R, *S)

  __alignx(16, Q);
  __alignx(16, R);
  __alignx(16, S);
  r = (double*) R;
  s = (double*) S;
  q = (double*) Q;
  rp = r + 24;
  sp = s + 24;
  _prefetch_spinor(rp);
  _prefetch_spinor(sp);
  x00 = __lfpd(r);    
  x01 = __lfpd(r+2);  
  x02 = __lfpd(r+4);  
  x03 = __lfpd(r+6);  
  x04 = __lfpd(r+8);  
  x05 = __lfpd(r+10); 
  x06 = __lfpd(r+12); 
  x07 = __lfpd(r+14); 
  x08 = __lfpd(r+16); 
  x09 = __lfpd(r+18); 
  x10 = __lfpd(r+20); 
  x11 = __lfpd(r+22); 
  y00 = __lfpd(s);   
  y01 = __lfpd(s+2); 
  y02 = __lfpd(s+4); 
  y03 = __lfpd(s+6); 
  y04 = __lfpd(s+8); 
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));
  s = sp;
  r = rp;
  q+=24;
#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    rp+=24;
    sp+=24;
    _prefetch_spinor(rp);
    _prefetch_spinor(sp);
    x00 = __lfpd(r);    
    x01 = __lfpd(r+2);  
    x02 = __lfpd(r+4);  
    x03 = __lfpd(r+6);  
    x04 = __lfpd(r+8);  
    x05 = __lfpd(r+10); 
    x06 = __lfpd(r+12); 
    x07 = __lfpd(r+14); 
    x08 = __lfpd(r+16); 
    x09 = __lfpd(r+18); 
    x10 = __lfpd(r+20); 
    x11 = __lfpd(r+22); 
    y00 = __lfpd(s);   
    y01 = __lfpd(s+2); 
    y02 = __lfpd(s+4); 
    y03 = __lfpd(s+6); 
    y04 = __lfpd(s+8); 
    y05 = __lfpd(s+10);
    y06 = __lfpd(s+12);
    y07 = __lfpd(s+14);
    y08 = __lfpd(s+16);
    y09 = __lfpd(s+18);
    y10 = __lfpd(s+20);
    y11 = __lfpd(s+22);
    
    __stfpd(q, __fpsub(x00, y00));
    __stfpd(q+2, __fpsub(x01, y01));
    __stfpd(q+4, __fpsub(x02, y02));
    __stfpd(q+6, __fpsub(x03, y03));
    __stfpd(q+8, __fpsub(x04, y04));
    __stfpd(q+10, __fpsub(x05, y05));
    __stfpd(q+12, __fpsub(x06, y06));
    __stfpd(q+14, __fpsub(x07, y07));
    __stfpd(q+16, __fpsub(x08, y08));
    __stfpd(q+18, __fpsub(x09, y09));
    __stfpd(q+20, __fpsub(x10, y10));
    __stfpd(q+22, __fpsub(x11, y11));
    s = sp;
    r = rp;
    q+=24;
  }
  x00 = __lfpd(r);    
  x01 = __lfpd(r+2);  
  x02 = __lfpd(r+4);  
  x03 = __lfpd(r+6);  
  x04 = __lfpd(r+8);  
  x05 = __lfpd(r+10); 
  x06 = __lfpd(r+12); 
  x07 = __lfpd(r+14); 
  x08 = __lfpd(r+16); 
  x09 = __lfpd(r+18); 
  x10 = __lfpd(r+20); 
  x11 = __lfpd(r+22); 
  y00 = __lfpd(s);   
  y01 = __lfpd(s+2); 
  y02 = __lfpd(s+4); 
  y03 = __lfpd(s+6); 
  y04 = __lfpd(s+8); 
  y05 = __lfpd(s+10);
  y06 = __lfpd(s+12);
  y07 = __lfpd(s+14);
  y08 = __lfpd(s+16);
  y09 = __lfpd(s+18);
  y10 = __lfpd(s+20);
  y11 = __lfpd(s+22);

  __stfpd(q, __fpsub(x00, y00));
  __stfpd(q+2, __fpsub(x01, y01));
  __stfpd(q+4, __fpsub(x02, y02));
  __stfpd(q+6, __fpsub(x03, y03));
  __stfpd(q+8, __fpsub(x04, y04));
  __stfpd(q+10, __fpsub(x05, y05));
  __stfpd(q+12, __fpsub(x06, y06));
  __stfpd(q+14, __fpsub(x07, y07));
  __stfpd(q+16, __fpsub(x08, y08));
  __stfpd(q+18, __fpsub(x09, y09));
  __stfpd(q+20, __fpsub(x10, y10));
  __stfpd(q+22, __fpsub(x11, y11));

  return;
}
Example #11
File: diff.c Project: LorenzoRiggio/tmLQCD
void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N) {
#ifdef OMP
#pragma omp parallel
  {
#endif
  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
  vector4double z0, z1, z2, z3, z4, z5;
  double *s, *r, *q;

  __alignx(32, s);
  __alignx(32, r);
  __alignx(32, q);
  __alignx(32, S);
  __alignx(32, R);

  __prefetch_by_load(S);
  __prefetch_by_load(R);
  __prefetch_by_load(Q);

#ifndef OMP
#pragma unroll(2)
#else
#pragma omp for
#endif
  for (int ix = 0; ix < N; ++ix) {
    s=(double*)((spinor *) S + ix);
    r=(double*)((spinor *) R + ix);
    q=(double*)((spinor *) Q + ix);
    __prefetch_by_load(S + ix + 1);
    __prefetch_by_load(R + ix + 1);
    __prefetch_by_stream(1, Q + ix + 1);
    x0 = vec_ld(0, r);
    x1 = vec_ld(0, r+4);
    x2 = vec_ld(0, r+8);
    x3 = vec_ld(0, r+12);
    x4 = vec_ld(0, r+16);
    x5 = vec_ld(0, r+20);
    y0 = vec_ld(0, s);
    y1 = vec_ld(0, s+4);
    y2 = vec_ld(0, s+8);
    y3 = vec_ld(0, s+12);
    y4 = vec_ld(0, s+16);
    y5 = vec_ld(0, s+20);
    z0 = vec_sub(x0, y0);
    z1 = vec_sub(x1, y1);
    z2 = vec_sub(x2, y2);
    z3 = vec_sub(x3, y3);
    z4 = vec_sub(x4, y4);
    z5 = vec_sub(x5, y5);
    vec_st(z0, 0, q);
    vec_st(z1, 0, q+4);
    vec_st(z2, 0, q+8);
    vec_st(z3, 0, q+12);
    vec_st(z4, 0, q+16);
    vec_st(z5, 0, q+20);
  }

#ifdef OMP
  } /* OpenMP parallel closing brace */
#endif

  return;
}
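
Both diff() variants (examples #10 and #11) compute the same thing, Q = R - S, component by component. A portable reference, taking the fields as raw double arrays and assuming 24 contiguous doubles (12 complex numbers) per spinor:

/* Hedged reference implementation: Q <- R - S, 24 doubles per spinor. */
void diff_ref(double * const q, const double * const r,
              const double * const s, const int N) {
  for (int i = 0; i < 24 * N; i++)
    q[i] = r[i] - s[i];
}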
Example #12
File: scalar_prod.c Project: chjost/tmLQCD
/*  <S,R>=S^* times R */
_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel) {
  _Complex double ALIGN res = 0.0;
#ifdef MPI
  _Complex double ALIGN mres;
#endif

#ifdef OMP
#pragma omp parallel
  {
  int thread_num = omp_get_thread_num();
#endif

  _Complex double ALIGN ds,tr,ts,tt,ks,kc;
  const spinor *s,*r;

  ks = 0.0;
  kc = 0.0;

#if (defined BGL && defined XLC)
  __alignx(16, S);
  __alignx(16, R);
#endif

#ifdef OMP
#pragma omp for
#endif
  for (int ix = 0; ix < N; ix++)
  {
    s= S + ix;
    r= R + ix;
    
    ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) +
         r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) +
         r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) +
         r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);

    /* Kahan Summation */
    tr=ds+kc;
    ts=tr+ks;
    tt=ts-ks;
    ks=ts;
    kc=tr-tt;
  }
  kc=ks+kc;

#ifdef OMP
  g_omp_acc_cp[thread_num] = kc;

  } /* OpenMP closing brace */

  /* having left the parallel section, we can now sum up the Kahan
     corrected sums from each thread into kc */
  for(int i = 0; i < omp_num_threads; ++i)
    res += g_omp_acc_cp[i];
#else
  res=kc;
#endif

#ifdef MPI
  if(parallel == 1)
  {
    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
    return(mres);
  }
#endif
  return(res);
}
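
The header comment fixes the convention: <S,R> = S^* times R, i.e. S is conjugated and R is not. A small self-contained check of the per-component rule used in the loop:

#include <complex.h>
#include <stdio.h>

/* <S,R> = sum_i conj(S_i) * R_i */
double _Complex scalar_prod_ref(const double _Complex *S,
                                const double _Complex *R, int n) {
  double _Complex res = 0.0;
  for (int i = 0; i < n; i++)
    res += conj(S[i]) * R[i];
  return res;
}

int main(void) {
  double _Complex S[2] = { 1.0 + 1.0 * I, 2.0 };
  double _Complex R[2] = { 1.0 - 1.0 * I, 1.0 * I };
  double _Complex z = scalar_prod_ref(S, R, 2);
  /* conj(1+i)*(1-i) + conj(2)*i = (1-i)^2 + 2i = -2i + 2i = 0 */
  printf("%f %+fi\n", creal(z), cimag(z));
  return 0;
}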
Example #13
void deriv_Sb_D_psi(spinor * const l, spinor * const k, 
		    hamiltonian_field_t * const hf, const double factor) {

  int ix,iy, iz;
  int ioff,ioff2,icx,icy, icz;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  su3adj * restrict ddd;
  static su3adj der;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  spinor * restrict r ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

  /* We have 32 registers available */
  double _Complex reg00, reg01, reg02, reg03, reg04, reg05;
  double _Complex reg10, reg11, reg12, reg13, reg14, reg15;
  /* For su3 matrix, use reg00 for missing register */
  double _Complex v00, v01, v02, v10, v11, v12, v20, v21;
  /* The following contains the left spinor (12 regs) and the final  */
  /* su3 matrix to trace over */
  double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, 
    r30, r31, r32;

#ifdef _KOJAK_INST
# pragma pomp inst begin(derivSb)
#endif

#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd)
  __alignx(16, l);
  __alignx(16, k);

  if(ieo==0) {
    ioff=0;
  }
  else {
    ioff=(VOLUME+RAND)/2;
  } 
  ioff2=(VOLUME+RAND)/2-ioff;

  /* for parallelization */
#ifdef MPI
  xchange_field(k, ieo);
  xchange_field(l, (ieo+1)%2);
#endif
  /************** loop over all lattice sites ****************/

  ix=ioff;
  iy=g_iup[ix][0]; icy=iy;
  sp = k + icy;
  _prefetch_spinor(sp);
  up=&hf->gaugefield[ix][0];
  _prefetch_su3(up);

  for(icx = ioff; icx < (VOLUME+ioff); icx++){

    /* load left vector r and */
    /* multiply with gamma5   */
    r = l + (icx-ioff);
    ix=icx;

    /*********************** direction +0 ********************/

    ddd = &hf->derivative[ix][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_add_to_reg0_reg1();
    _bgl_add_to_reg0_up_reg1_up();
      
    _bgl_add_r0_to_r2_reg1();
    _bgl_add_r1_to_r3_reg1_up();

    iy=g_idn[ix][0]; icy=iy;
    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][0];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /************** direction -0 ****************************/

    ddd = &hf->derivative[iy][0];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);

    _bgl_sub_from_reg0_reg1();
    _bgl_sub_from_reg0_up_reg1_up();

    _bgl_sub_from_r0_r2_reg1();
    _bgl_sub_from_r1_r3_reg1_up();

    iy=g_iup[ix][1]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][1];      
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka0);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +1 **************************/

    ddd = &hf->derivative[ix][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_i_mul_add_to_reg0_reg1_up();
    _bgl_i_mul_add_to_reg0_up_reg1();
      
    _bgl_i_mul_add_r0_to_r3_reg1();
    _bgl_i_mul_add_r1_to_r2_reg1_up();

    iy=g_idn[ix][1]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][1];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /**************** direction -1 *************************/

    ddd = &hf->derivative[iy][1];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_i_mul_sub_from_reg0_reg1_up();
    _bgl_i_mul_sub_from_reg0_up_reg1();
      
    _bgl_i_mul_sub_from_r0_r3_reg1();
    _bgl_i_mul_sub_from_r1_r2_reg1_up();

    iy=g_iup[ix][2]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][2];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka1);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /*************** direction +2 **************************/

    ddd = &hf->derivative[ix][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_add_to_reg0_reg1_up();
    _bgl_sub_from_reg0_up_reg1();
      
    _bgl_add_r0_to_r3_reg1();
    _bgl_sub_from_r1_r2_reg1_up();

    iy=g_idn[ix][2]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][2];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);
      
    /***************** direction -2 ************************/

    ddd = &hf->derivative[iy][2];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_sub_from_reg0_reg1_up();
    _bgl_add_to_reg0_up_reg1();
      
    _bgl_sub_from_r0_r3_reg1();
    _bgl_add_r1_to_r2_reg1_up();

    iy=g_iup[ix][3]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[ix][3];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka2);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** direction +3 ***********************/

    ddd = &hf->derivative[ix][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sp).s0);
    _bgl_load_reg0_up((*sp).s1);
    _bgl_load_reg1((*sp).s2);
    _bgl_load_reg1_up((*sp).s3);
    
    _bgl_i_mul_add_to_reg0_reg1();
    _bgl_i_mul_sub_from_reg0_up_reg1_up();
      
    _bgl_i_mul_add_r0_to_r2_reg1();
    _bgl_i_mul_sub_from_r1_r3_reg1_up();

    iy=g_idn[ix][3]; icy=iy;

    sm = k + icy;
    _prefetch_spinor(sm);
    um=&hf->gaugefield[iy][3];
    _prefetch_su3(um);

    _bgl_tensor_product_and_add();
    /* result in v now */
    _bgl_su3_times_v_dagger(*up);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /***************** direction -3 ************************/

    ddd = &hf->derivative[iy][3];
    _bgl_load_r0((*r).s0);
    _bgl_load_r1((*r).s1);
    _bgl_load_minus_r2((*r).s2);
    _bgl_load_minus_r3((*r).s3);

    _bgl_load_reg0((*sm).s0);
    _bgl_load_reg0_up((*sm).s1);
    _bgl_load_reg1((*sm).s2);
    _bgl_load_reg1_up((*sm).s3);
    
    _bgl_i_mul_sub_from_reg0_reg1();
    _bgl_i_mul_add_to_reg0_up_reg1_up();
      
    _bgl_i_mul_sub_from_r0_r2_reg1();
    _bgl_i_mul_add_r1_to_r3_reg1_up();

    /* something wrong here...*/
    icz=icx+1;
    if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff;
    iz=icz;
    iy=g_iup[iz][0]; icy=iy;

    sp = k + icy;
    _prefetch_spinor(sp);
    up=&hf->gaugefield[iz][0];
    _prefetch_su3(up);

    _bgl_tensor_product_and_add_d();
    /* result in v now */
    _bgl_su3_times_v_dagger(*um);
    /* result in r now */
    _bgl_complex_times_r(ka3);
    _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor);

    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif
}
Example #14
File: D_psi.c Project: VincentDrach/tmLQCD
/* this is the hopping part only */
void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) {

  int * idx = _idx;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#pragma disjoint(*s, *sp, *sm, *rr, *up, *um)

  __alignx(16,rr);
  __alignx(16,s);

  /*********************** direction +0 ************************/
  up = u;
  sp = (spinor *) s + (*idx);
  idx++;

  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor *) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_add_reg1();
  /* result is now in regx0, regx1, regx2 x = 0,1 */
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_0);
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs3_reg1();

  /*********************** direction -0 ************************/
  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_sub_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_0);
  
  _bgl_add_to_rs0_reg0();
  _bgl_sub_from_rs2_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs3_reg1();
  
  /*********************** direction +1 ************************/
  
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s3);
  _bgl_load_reg1_up(sp->s2);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_add_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_1);
  
  _bgl_add_to_rs0_reg0();
  _bgl_i_mul_sub_from_rs3_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg1();
  
  /*********************** direction -1 ************************/

  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s3);
  _bgl_load_reg1_up(sm->s2);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_sub_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_1);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_add_to_rs3_reg0();
  _bgl_i_mul_add_to_rs2_reg1();      
  
  /*********************** direction +2 ************************/
  
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 
  idx++;

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg1_up(sp->s2);
  _bgl_load_reg0_up(sp->s3);
  _bgl_vector_add_reg0();
  _bgl_vector_sub_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_2);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_sub_from_rs2_reg1();
  _bgl_add_to_rs3_reg0();
  

  /*********************** direction -2 ************************/
  up = um+1;
  _prefetch_su3(up); 
  sp = (spinor*) s + (*idx);
  _prefetch_spinor(sp); 
  idx++;

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg1_up(sm->s2);
  _bgl_load_reg0_up(sm->s3);
  _bgl_vector_sub_reg0();
  _bgl_vector_add_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_2);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_add_to_rs2_reg1();
  _bgl_sub_from_rs3_reg0();
  
  /*********************** direction +3 ************************/
  um = up+1;
  _prefetch_su3(um); 
  sm = (spinor*) s + (*idx);
  _prefetch_spinor(sm); 

  _bgl_load_reg0(sp->s0);
  _bgl_load_reg1(sp->s1);
  _bgl_load_reg0_up(sp->s2);
  _bgl_load_reg1_up(sp->s3);
  _bgl_vector_i_mul_add_reg0();
  _bgl_vector_i_mul_sub_reg1();
  
  _bgl_su3_multiply_double((*up));
  _bgl_vector_cmplx_mul_double(phase_3);
  
  _bgl_add_to_rs0_reg0();
  _bgl_add_to_rs1_reg1();
  _bgl_i_mul_sub_from_rs2_reg0();
  _bgl_i_mul_add_to_rs3_reg1();
  
  /*********************** direction -3 ************************/

  _bgl_load_reg0(sm->s0);
  _bgl_load_reg1(sm->s1);
  _bgl_load_reg0_up(sm->s2);
  _bgl_load_reg1_up(sm->s3);
  _bgl_vector_i_mul_sub_reg0();
  _bgl_vector_i_mul_add_reg1();
  
  _bgl_su3_inverse_multiply_double((*um));
  _bgl_vector_cmplxcg_mul_double(phase_3);
  
  _bgl_add_to_rs0_reg0();
  _bgl_store_rs0(rr->s0);
  _bgl_i_mul_add_to_rs2_reg0();
  _bgl_store_rs2(rr->s2);
  
  _bgl_add_to_rs1_reg1();
  _bgl_store_rs1(rr->s1);
  _bgl_i_mul_sub_from_rs3_reg1();
  _bgl_store_rs3(rr->s3);

}
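
The load/add pairs at the top of each direction block are the spin projection that makes the half-spinor trick work: for the +/-0 hops only two of the four spin components are independent, so the code forms h0 = s0 +/- s2 and h1 = s1 +/- s3 before the SU(3) multiply. A scalar sketch of just that step (struct layouts are assumptions):

#include <complex.h>

typedef struct { double _Complex c[3]; } cvec3;
typedef struct { cvec3 s0, s1, s2, s3; } fspinor;
typedef struct { cvec3 h0, h1; } hspinor;

/* sign = +1 mirrors _bgl_vector_add_reg0/reg1 (direction +0),
 * sign = -1 mirrors _bgl_vector_sub_reg0/reg1 (direction -0) */
void project_t_sk(hspinor *h, const fspinor *in, double sign) {
  for (int i = 0; i < 3; i++) {
    h->h0.c[i] = in->s0.c[i] + sign * in->s2.c[i];
    h->h1.c[i] = in->s1.c[i] + sign * in->s3.c[i];
  }
}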
Example #15
void xchange_field(spinor * const l, const int ieo) {

#ifdef MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
  int ireq;
#  if ( defined PARALLELT || defined PARALLELX )
  int reqcount = 4;
#  elif ( defined PARALLELXT || defined PARALLELXY )
  int reqcount = 8;
#  elif ( defined PARALLELXYT || defined PARALLELXYZ )
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int ix=0;
  int reqcount = 16;
#  endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif
#  if (defined BGL && defined XLC)
  __alignx(16, l);
#  endif

#  ifdef MPI


  /* In 4 dimensions there are two processors sharing the   */
  /* communication bandwidth. So the first should start     */
  /* in forward direction, the second in backward direction */
  /* This might only work if the third direction is         */
  /* parallelised only on the node                          */
  if(g_proc_coords[3]%2 == 0) {

    ireq=0;

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]);
    MPI_Irecv( (void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid,  &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif

#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* This now depends on whether the field is even or odd */
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */


    if(ieo == 1) { 
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); 
    } else { 
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); 
    } 

#    endif


    ireq=2;

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if ( defined PARALLELXYZT || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
    if(ieo == 1) { 
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); 
    } else {  
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); 
    } 
#    endif

  } else {
    ireq=0;
    
#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif

#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* this depends on whether the field is even or odd */
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    if(ieo == 1) { 
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    } else { 
      MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]);
    } 
#    endif    

    ireq=2;

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]);
    MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]);
    ireq=ireq+4;
#    endif
    
#    if ( defined PARALLELXYZT  || defined PARALLELXYZ )
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    if(ieo == 1) { 
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    } else { 
      MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]);
      MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]);
    } 
#    endif

  }
  MPI_Waitall(reqcount, requests, status);


#  endif /* MPI */
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}
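The function above posts every transfer as a non-blocking Isend/Irecv pair and completes them all with a single MPI_Waitall; the two branches post the same transfers in a different direction order, so that paired neighbours drive opposite link directions first. A minimal sketch of that pattern for one direction, assuming a 1-D decomposition; all names (parity, nb_dn, nb_up, the buffer pointers) are illustrative, not tmLQCD's:

#include <mpi.h>

/* Exchange the two faces of one direction; even-parity ranks drive the
   downward link first, odd-parity ranks the upward one. */
void ordered_exchange(double *lo_halo, double *lo_edge,
                      double *hi_halo, double *hi_edge, int n,
                      int nb_dn, int nb_up, int parity, MPI_Comm comm) {
  MPI_Request req[4];
  int i = 0;
  if(parity == 0) {
    MPI_Isend(lo_edge, n, MPI_DOUBLE, nb_dn, 1, comm, &req[i++]);
    MPI_Irecv(hi_halo, n, MPI_DOUBLE, nb_up, 1, comm, &req[i++]);
    MPI_Isend(hi_edge, n, MPI_DOUBLE, nb_up, 2, comm, &req[i++]);
    MPI_Irecv(lo_halo, n, MPI_DOUBLE, nb_dn, 2, comm, &req[i++]);
  }
  else {
    MPI_Isend(hi_edge, n, MPI_DOUBLE, nb_up, 2, comm, &req[i++]);
    MPI_Irecv(lo_halo, n, MPI_DOUBLE, nb_dn, 2, comm, &req[i++]);
    MPI_Isend(lo_edge, n, MPI_DOUBLE, nb_dn, 1, comm, &req[i++]);
    MPI_Irecv(hi_halo, n, MPI_DOUBLE, nb_up, 1, comm, &req[i++]);
  }
  MPI_Waitall(4, req, MPI_STATUSES_IGNORE);
}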
Example #16
0
void xchange_field(spinor * const l, const int ieo) {

#ifdef MPI
  MPI_Request requests[16];
  MPI_Status status[16];
#endif
#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int ix=0;
  int reqcount = 16;
#  endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangefield)
#endif
#  if (defined BGL && defined XLC)
#    ifdef PARALLELXYZT
  __alignx(16, field_buffer_z);
  __alignx(16, field_buffer_z2);
#    endif
  __alignx(16, l);
#  endif

#  ifdef MPI


  /* In 4 dimensions there are two processors sharing the    */
  /* communication bandwidth, so the first should start in   */
  /* the forward direction and the second in the backward    */
  /* direction. This might only work if the third direction  */
  /* is parallelised on the node only.                       */
  if(g_proc_coords[3]%2 == 0) {
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[0]);
    MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[1]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid,  &requests[4]);
    MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[5]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[8]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[9]);
#    endif
    
#    if (defined PARALLELXYZT)
    /* fill the send buffer; this depends on whether the field is even or odd */
    if(ieo == 1) {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
      }
    }

    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]);
    MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]); 

#    endif
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[2]);
    MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[3]);
    
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[6]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[7]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[10]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[11]);
#    endif
    
#    if defined PARALLELXYZT
    if(ieo == 1) {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]);
    MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]); 
#    endif

  }
  else {
    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[0]);
    MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[1]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[4]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[5]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[8]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[9]);
#    endif

#    if (defined PARALLELXYZT)
    /* fill the send buffer; this depends on whether the field is even or odd */
    if(ieo == 1) {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = 0; ix < T*LX*LY/2; ix++) {
        field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]);
    MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]); 
#    endif    

    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[2]);
    MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[3]);
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid,  &requests[6]);
    MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[7]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[10]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[11]);
#    endif
    
#    if defined PARALLELXYZT
    if(ieo == 1) {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
      }
    }
    else {
      for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
        field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
      }
    }
    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]);
    MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]); 
#    endif

  }
  MPI_Waitall(reqcount, requests, status);


#  endif


  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangefield)
#endif
}
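In the PARALLELXYZT branches above, the z-boundary sites of one parity are not contiguous in memory, so they are first gathered into field_buffer_z and field_buffer_z2 through the g_field_z_ipt_even/odd index lists and then shipped as a flat MPI_DOUBLE array; the count 12*T*LX*LY is consistent with T*LX*LY/2 sites per parity times 24 doubles per spinor. A minimal sketch of that gather step, with illustrative names (spinor_t, site_list):

/* stand-in for the spinor type: 4 spins x 3 colours x (re, im) */
typedef struct { double c[24]; } spinor_t;

/* copy the boundary sites named by site_list into a contiguous buffer:
   strided reads, contiguous writes */
static void gather_boundary(spinor_t *buf, const spinor_t *field,
                            const int *site_list, int n_sites) {
  for(int i = 0; i < n_sites; i++)
    buf[i] = field[site_list[i]];
}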
Example #17
0
double square_norm(spinor * const P, const int N, const int parallel) {
  int ix=0;
  double res, res2;
  double *s ALIGN;
  double *sp ALIGN;
  double _Complex x00, x01, x02, x03, x04, x05, x06, x07, 
    x08, x09, x10, x11;
  double _Complex y00, y01, y02, y03, y04, y05, y06, y07, 
    y08, y09, y10, y11;

  __alignx(16, P);
  s = (double*)P;
  sp = s+24;
  _prefetch_spinor(sp);
  x00 = __lfpd(s);
  x01 = __lfpd(s+2);
  x02 = __lfpd(s+4);
  x03 = __lfpd(s+6);
  x04 = __lfpd(s+8);
  x05 = __lfpd(s+10);
  x06 = __lfpd(s+12);
  x07 = __lfpd(s+14);
  x08 = __lfpd(s+16);
  x09 = __lfpd(s+18);
  x10 = __lfpd(s+20);
  x11 = __lfpd(s+22);
  
  y00 = __fpmul(x00, x00);
  y01 = __fpmul(x01, x01);
  y02 = __fpmul(x02, x02);
  y03 = __fpmul(x03, x03);
  y04 = __fpmul(x04, x04);
  y05 = __fpmul(x05, x05);
  y06 = __fpmul(x06, x06);
  y07 = __fpmul(x07, x07);
  y08 = __fpmul(x08, x08);
  y09 = __fpmul(x09, x09);
  y10 = __fpmul(x10, x10);
  y11 = __fpmul(x11, x11);
  s = sp;


#pragma unroll(12)
  for(ix = 1; ix < N-1; ix++) {
    sp += 24;
    _prefetch_spinor(sp);
    x00 = __lfpd(s);   
    x01 = __lfpd(s+2); 
    x02 = __lfpd(s+4); 
    x03 = __lfpd(s+6); 
    x04 = __lfpd(s+8); 
    x05 = __lfpd(s+10);
    x06 = __lfpd(s+12);
    x07 = __lfpd(s+14);
    x08 = __lfpd(s+16);
    x09 = __lfpd(s+18);
    x10 = __lfpd(s+20);
    x11 = __lfpd(s+22);
    y00 = __fpmadd(y00, x00, x00); 
    y01 = __fpmadd(y01, x01, x01); 
    y02 = __fpmadd(y02, x02, x02); 
    y03 = __fpmadd(y03, x03, x03); 
    y04 = __fpmadd(y04, x04, x04); 
    y05 = __fpmadd(y05, x05, x05); 
    y06 = __fpmadd(y06, x06, x06); 
    y07 = __fpmadd(y07, x07, x07); 
    y08 = __fpmadd(y08, x08, x08); 
    y09 = __fpmadd(y09, x09, x09); 
    y10 = __fpmadd(y10, x10, x10); 
    y11 = __fpmadd(y11, x11, x11); 
    s=sp;
  }
  x00 = __lfpd(s);   
  x01 = __lfpd(s+2); 
  x02 = __lfpd(s+4); 
  x03 = __lfpd(s+6); 
  x04 = __lfpd(s+8); 
  x05 = __lfpd(s+10);
  x06 = __lfpd(s+12);
  x07 = __lfpd(s+14);
  x08 = __lfpd(s+16);
  x09 = __lfpd(s+18);
  x10 = __lfpd(s+20);
  x11 = __lfpd(s+22);
  y00 = __fpmadd(y00, x00, x00); 
  y01 = __fpmadd(y01, x01, x01); 
  y02 = __fpmadd(y02, x02, x02); 
  y03 = __fpmadd(y03, x03, x03); 
  y04 = __fpmadd(y04, x04, x04); 
  y05 = __fpmadd(y05, x05, x05); 
  y06 = __fpmadd(y06, x06, x06); 
  y07 = __fpmadd(y07, x07, x07); 
  y08 = __fpmadd(y08, x08, x08); 
  y09 = __fpmadd(y09, x09, x09); 
  y10 = __fpmadd(y10, x10, x10); 
  y11 = __fpmadd(y11, x11, x11); 
  
  y00 = __fpadd(y00, y01);
  y02 = __fpadd(y02, y03);
  y04 = __fpadd(y04, y05);
  y06 = __fpadd(y06, y07);
  y08 = __fpadd(y08, y09);
  y10 = __fpadd(y10, y11);
  y00 = __fpadd(y00, y02);
  y04 = __fpadd(y04, y06);
  y08 = __fpadd(y08, y10);
  y00 = __fpadd(y00, y04);
  y00 = __fpadd(y00, y08);
  res = __creal(y00)+__cimag(y00);
#  ifdef TM_USE_MPI
  if(parallel) {
    MPI_Allreduce(&res, &res2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return res2;
  }
#  endif
  return res;
}
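The BG/L intrinsics above (__lfpd paired loads, __fpmadd fused multiply-adds) compute nothing more exotic than a sum of squares over the 24 doubles of each spinor, software-pipelined so that the loads for the next site overlap the arithmetic of the current one. A plain-C sketch of the same reduction, without the prefetching, the pipelining, or the MPI reduction; it assumes, as the pointer arithmetic above does, 24 doubles per spinor:

double square_norm_plain(const double *s, const int N) {
  double res = 0.0;
  for(int ix = 0; ix < N; ix++, s += 24) {
    for(int j = 0; j < 24; j++) {
      res += s[j] * s[j];   /* re^2 + im^2 of every colour/spin component */
    }
  }
  return res;
}

Called, for instance, as square_norm_plain((const double*)P, N).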
Example #18
0
File: gsrb.flux.c  Project: hpgmg/hpgmg
//------------------------------------------------------------------------------------------------------------------------------
void smooth(level_type * level, int x_id, int rhs_id, double a, double b){
  // allocate a buffer to hold fluxes...
  if(level->fluxes==NULL)level->fluxes = (double*)MALLOC( ( (4*level->num_threads)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride) + BOX_ALIGN_JSTRIDE)*sizeof(double) );
  // align fluxes to BOX_ALIGN_JSTRIDE
  double * __restrict__ fluxes_aligned = level->fluxes;
  uint64_t unaligned_by = (uint64_t)(fluxes_aligned) & (BOX_ALIGN_JSTRIDE-1)*sizeof(double);
  if(unaligned_by)fluxes_aligned = (double*)( (uint64_t)(fluxes_aligned) + BOX_ALIGN_JSTRIDE*sizeof(double) - unaligned_by );


  int s;
  for(s=0;s<2*NUM_SMOOTHS;s++){ // there are two sweeps per GSRB smooth

  // exchange the ghost zone...
  if((s&1)==0){
    exchange_boundary(level,       x_id,stencil_get_shape());
            apply_BCs(level,       x_id,stencil_get_shape());
  }else{
    exchange_boundary(level,VECTOR_TEMP,stencil_get_shape());
            apply_BCs(level,VECTOR_TEMP,stencil_get_shape());
  }

  // apply the smoother...
  double _timeStart = getTime();
  double h2inv = 1.0/(level->h*level->h);

  // loop over all block/tiles this process owns...
  #ifdef _OPENMP
  #pragma omp parallel if(level->num_my_blocks>1)
  #endif
  {
    int block;
    int threadID=0;
    #ifdef _OPENMP
    threadID=omp_get_thread_num();
    #endif

    // [thread][flux][ij] layout
    double * __restrict__ flux_i    =  fluxes_aligned + (4*threadID + 0)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
    double * __restrict__ flux_j    =  fluxes_aligned + (4*threadID + 1)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride);
    double * __restrict__ flux_k[2] = {fluxes_aligned + (4*threadID + 2)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride),
                                       fluxes_aligned + (4*threadID + 3)*(BLOCKCOPY_TILE_J+1)*(level->box_jStride)};


    // loop over (cache) blocks...
    #ifdef _OPENMP
    #pragma omp for schedule(static,1)
    #endif
    for(block=0;block<level->num_my_blocks;block++){
      const int box  = level->my_blocks[block].read.box;
      const int jlo  = level->my_blocks[block].read.j;
      const int klo  = level->my_blocks[block].read.k;
      const int jdim = level->my_blocks[block].dim.j;
      const int kdim = level->my_blocks[block].dim.k;

      const int ghosts  = level->my_boxes[box].ghosts;
      const int jStride = level->my_boxes[box].jStride;
      const int kStride = level->my_boxes[box].kStride;

      const double * __restrict__ rhs    = level->my_boxes[box].vectors[       rhs_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      #ifdef VECTOR_ALPHA
      const double * __restrict__ alpha  = level->my_boxes[box].vectors[VECTOR_ALPHA ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      #else
      const double * __restrict__ alpha  = NULL;
      #endif
      const double * __restrict__ beta_i = level->my_boxes[box].vectors[VECTOR_BETA_I] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      const double * __restrict__ beta_j = level->my_boxes[box].vectors[VECTOR_BETA_J] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      const double * __restrict__ beta_k = level->my_boxes[box].vectors[VECTOR_BETA_K] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      const double * __restrict__ Dinv   = level->my_boxes[box].vectors[VECTOR_DINV  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
      const double * __restrict__ x_n;
            double * __restrict__ x_np1;
                     if((s&1)==0){x_n    = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
                                  x_np1  = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}
                             else{x_n    = level->my_boxes[box].vectors[VECTOR_TEMP  ] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);
                                  x_np1  = level->my_boxes[box].vectors[         x_id] + ghosts*(1+jStride+kStride) + (jlo*jStride + klo*kStride);}

      #ifdef __INTEL_COMPILER
      // superfluous with OMP4 simd (?)
      //__assume_aligned(x_n      ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(x_np1    ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(rhs      ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(alpha    ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(beta_i   ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(beta_j   ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(beta_k   ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(Dinv     ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(flux_i   ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(flux_j   ,BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(flux_k[0],BOX_ALIGN_JSTRIDE*sizeof(double));
      //__assume_aligned(flux_k[1],BOX_ALIGN_JSTRIDE*sizeof(double));
      __assume(           jStride % BOX_ALIGN_JSTRIDE == 0); // e.g. jStride%4==0 or jStride%8==0, hence x+jStride is aligned
      __assume(           kStride % BOX_ALIGN_JSTRIDE == 0);
      __assume(             jStride >=   BOX_ALIGN_JSTRIDE);
      __assume(             kStride >= 3*BOX_ALIGN_JSTRIDE);
      __assume(                                   jdim > 0);
      __assume(                                   kdim > 0);
      #elif __xlC__
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), rhs      );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), alpha    );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_i   );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_j   );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), beta_k   );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), Dinv     );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_n      );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), x_np1    );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_i   );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_j   );
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[0]);
      __alignx(BOX_ALIGN_JSTRIDE*sizeof(double), flux_k[1]);
      #endif


      int ij,k;
      double * __restrict__ flux_klo = flux_k[0];
      // startup / prolog... calculate flux_klo (bottom of cell)...
      #if (_OPENMP>=201307)
      #pragma omp simd aligned(beta_k,x_n,flux_klo:BOX_ALIGN_JSTRIDE*sizeof(double))
      #endif
      for(ij=0;ij<jdim*jStride;ij++){
        flux_klo[ij] = beta_dxdk(x_n,ij); // k==0
      }


      // wavefront loop...
      for(k=0;k<kdim;k++){
        double * __restrict__ flux_klo = flux_k[(k  )&0x1];
        double * __restrict__ flux_khi = flux_k[(k+1)&0x1];


        // calculate flux_i and flux_j together
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(beta_i,beta_j,x_n,flux_i,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
        #endif
        for(ij=0;ij<jdim*jStride;ij++){
          int ijk = ij + k*kStride;
          flux_i[ij] = beta_dxdi(x_n,ijk);
          flux_j[ij] = beta_dxdj(x_n,ijk);
        }


        // calculate flux_jhi
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(beta_j,x_n,flux_j:BOX_ALIGN_JSTRIDE*sizeof(double))
        #endif
        for(ij=jdim*jStride;ij<(jdim+1)*jStride;ij++){
          int ijk = ij + k*kStride;
          flux_j[ij] = beta_dxdj(x_n,ijk);
        }


        // calculate flux_khi (top of cell)
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(beta_k,x_n,flux_khi:BOX_ALIGN_JSTRIDE*sizeof(double))
        #endif
        for(ij=0;ij<jdim*jStride;ij++){
          int ijk = ij + k*kStride;
          flux_khi[ij] = beta_dxdk(x_n,ijk+kStride); // k+1
        }


        const int color000 = (level->my_boxes[box].low.i^level->my_boxes[box].low.j^level->my_boxes[box].low.k^jlo^klo^s);  // is element 000 of this *BLOCK* red or black on this sweep
        const double * __restrict__ RedBlack = level->RedBlack_FP + ghosts*(1+jStride) + jStride*((k^color000)&0x1); // Red/Black pencils... presumes ghost zones were correctly colored
        #if (_OPENMP>=201307)
        #pragma omp simd aligned(flux_i,flux_j,flux_klo,flux_khi,alpha,rhs,Dinv,x_n,x_np1,RedBlack:BOX_ALIGN_JSTRIDE*sizeof(double)) 
        #endif
        #ifdef __INTEL_COMPILER
        #pragma vector nontemporal // generally, we don't expect to reuse x_np1
        #endif
        for(ij=0;ij<jdim*jStride;ij++){
          int ijk = ij + k*kStride;
          double Lx = - flux_i[  ij] + flux_i[  ij+      1]
                      - flux_j[  ij] + flux_j[  ij+jStride]
                      - flux_klo[ij] + flux_khi[ij        ];
          #ifdef USE_HELMHOLTZ
          double Ax = a*alpha[ijk]*x_n[ijk] - b*Lx;
          #else
          double Ax = -b*Lx;
          #endif
          x_np1[ijk] = x_n[ijk] + RedBlack[ij]*Dinv[ijk]*(rhs[ijk]-Ax);
        }


      } // kdim

    } // block
  } // omp
  level->timers.smooth += (double)(getTime()-_timeStart);

  } // s-loop
}
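The flux_k[2] ping-pong above is what makes the k loop a wavefront: the plane computed as flux_khi in sweep k is re-read as flux_klo in sweep k+1, so each k-face of beta_dxdk is evaluated exactly once. A self-contained sketch of that double-buffering, with compute_face and update_cells as illustrative stand-ins for the face-flux and cell-update loops:

typedef void (*face_fn)(double *plane, int k);
typedef void (*cell_fn)(int k, const double *lo, const double *hi);

void wavefront_sweep(double *buf0, double *buf1, int kdim,
                     face_fn compute_face, cell_fn update_cells) {
  double *plane[2] = { buf0, buf1 };
  compute_face(plane[0], 0);            /* prolog: face below the first cell */
  for(int k = 0; k < kdim; k++) {
    double *lo = plane[k & 0x1];        /* computed in the previous sweep */
    double *hi = plane[(k+1) & 0x1];
    compute_face(hi, k + 1);            /* face above cell k, reused next sweep */
    update_cells(k, lo, hi);
  }
}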
Example #19
0
/* 32-2. */
void xchange_halffield32() {

#  ifdef MPI

  MPI_Request requests[16];
  MPI_Status status[16];
#  ifdef PARALLELT
  int reqcount = 4;
#  elif defined PARALLELXT
  int reqcount = 8;
#  elif defined PARALLELXYT
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int reqcount = 16;
#  endif
#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangehalf32)
#endif
#  if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor32);
#  endif

  /* send the data to the neighbour on the right in t direction */
  /* receive the data from the neighbour on the left in t direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME), LX*LY*LZ*12/2, MPI_FLOAT, 
            g_nb_t_up, 81, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT, 
            g_nb_t_dn, 81, g_cart_grid, &requests[1]);

  /* send the data to the neighbour on the left in t direction */
  /* receive the data from the neighbour on the right in t direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT, 
            g_nb_t_dn, 82, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2), LX*LY*LZ*12/2, MPI_FLOAT, 
            g_nb_t_up, 82, g_cart_grid, &requests[3]);

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)

  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT, 
            g_nb_x_up, 91, g_cart_grid, &requests[4]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_dn, 91, g_cart_grid, &requests[5]);

  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_dn, 92, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT,
            g_nb_x_up, 92, g_cart_grid, &requests[7]);
#    endif
    
#    if (defined PARALLELXYT || defined PARALLELXYZT)
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT, 
            g_nb_y_up, 101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT, 
            g_nb_y_dn, 101, g_cart_grid, &requests[9]);

  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT, 
            g_nb_y_dn, 102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT, 
            g_nb_y_up, 102, g_cart_grid, &requests[11]);
#    endif
    
#    if (defined PARALLELXYZT)
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ), 
            T*LX*LY*12/2, MPI_FLOAT, g_nb_z_up, 503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), 
            T*LX*LY*12/2, MPI_FLOAT, g_nb_z_dn, 503, g_cart_grid, &requests[13]); 

  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)(HalfSpinor32 + 4*VOLUME + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), 
            T*LX*LY*12/2, MPI_FLOAT, g_nb_z_dn, 504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(HalfSpinor32 + 4*VOLUME + RAND/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), 
            T*LX*LY*12/2, MPI_FLOAT, g_nb_z_up, 504, g_cart_grid, &requests[15]); 
#    endif

  MPI_Waitall(reqcount, requests, status); 
#  endif /* MPI */
  return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchangehalf32)
#endif
}
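Every message above carries 12 floats per boundary site (one projected half-spinor), which is where the recurring factor 12/2 next to the face site counts comes from. The same bookkeeping could be expressed once with a derived datatype; a hypothetical alternative for illustration, not how tmLQCD actually declares its types:

#include <mpi.h>

/* one half-spinor = 12 contiguous floats */
static MPI_Datatype make_halfspinor32_type(void) {
  MPI_Datatype t;
  MPI_Type_contiguous(12, MPI_FLOAT, &t);
  MPI_Type_commit(&t);
  return t;
}
/* a count of LX*LY*LZ/2 of this type would then replace
   LX*LY*LZ*12/2 elements of MPI_FLOAT in the t-direction calls */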
Example #20
0
void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {

#ifdef MPI
    MPI_Request requests[32];
    MPI_Status status[32];
#endif
    int reqcount = 0;
#if defined PARALLELXYZT
    int ix=0;
#endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange2fields)
#endif

#  ifdef MPI

#  if (defined BGL && defined XLC)
#    ifdef PARALLELXYZT
    __alignx(16, field_buffer_z);
    __alignx(16, field_buffer_z2);
    __alignx(16, field_buffer_z3);
    __alignx(16, field_buffer_z4);
#    endif
    __alignx(16, l);
#  endif

    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)k, 1, field_time_slice_cont, g_nb_t_dn, 83, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 83, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(k+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 84, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 84, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;


#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid,  &requests[reqcount]);
    MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)k, 1, field_x_slice_gath, g_nb_x_dn, 93, g_cart_grid,  &requests[reqcount]);
    MPI_Irecv((void*)(k+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 93, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(k+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 94, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 94, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;
#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)k, 1, field_y_slice_gath, g_nb_y_dn, 103, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 103, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(k+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 104, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 104, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

#    endif

#    if (defined PARALLELXYZT)
    /* fill the send buffers; this depends on whether the field is even or odd */
    if(ieo == 1) {
        for(ix = 0; ix < T*LX*LY/2; ix++) {
            field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ];
        }
    }
    else {
        for(ix = 0; ix < T*LX*LY/2; ix++) {
            field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ];
        }
    }
    if(ieo == 1) {
        for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
            field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ];
        }
    }
    else {
        for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
            field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ];
        }
    }
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* fill the send buffers; this depends on whether the field is even or odd */
    if(ieo == 0) {
        for(ix = 0; ix < T*LX*LY/2; ix++) {
            field_buffer_z3[ix] = k[ g_field_z_ipt_even[ix] ];
        }
    }
    else {
        for(ix = 0; ix < T*LX*LY/2; ix++) {
            field_buffer_z3[ix] = k[ g_field_z_ipt_odd[ix] ];
        }
    }
    if(ieo == 0) {
        for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
            field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_even[ix] ];
        }
    }
    else {
        for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) {
            field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_odd[ix] ];
        }
    }
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    MPI_Isend((void*)field_buffer_z3, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 505, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 505, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    MPI_Isend((void*)field_buffer_z4, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 506, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 506, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;


#    endif


    MPI_Waitall(reqcount, requests, status);
#  endif
    return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange2fields)
#endif
}
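With two fields in flight at once, every transfer needs its own tag so that the receives for k cannot match the sends for l; the code keeps them disjoint per direction, orientation and field (81/82 vs 83/84 in t, 91/92 vs 93/94 in x, 101/102 vs 103/104 in y). A small sketch of one way to derive such tags; the helper is hypothetical, and the z direction above uses its own 503..506 block instead:

/* dir: 0 = t, 1 = x, 2 = y; up: 0 or 1; field: 0 for l, 1 for k */
static int make_tag(int dir, int up, int field) {
  return 81 + 10*dir + 2*field + up;   /* t: 81..84, x: 91..94, y: 101..104 */
}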
Example #21
0
void xchange_2fields(spinor * const l, spinor * const k, const int ieo) {

#ifdef MPI
    MPI_Request requests[32];
    MPI_Status status[32];
#endif
    int reqcount = 0;

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchange2fields)
#endif

#  ifdef MPI

#  if (defined BGL && defined XLC)
    __alignx(16, l);
#  endif

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left */
    /* receive the data from the neighbour on the right */
    MPI_Isend((void*)(k+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 83, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 83, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right */
    /* receive the data from the neighbour on the left */
    MPI_Isend((void*)(k+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 84, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 84, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;
#    endif

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid,  &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left in x direction */
    /* receive the data from the neighbour on the right in x direction */
    MPI_Isend((void*)(k+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 93, g_cart_grid,  &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 93, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in x direction */
    /* receive the data from the neighbour on the left in x direction */
    MPI_Isend((void*)(k+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 94, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 94, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;
#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the left in y direction */
    /* receive the data from the neighbour on the right in y direction */
    MPI_Isend((void*)(k+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 103, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 103, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

    /* send the data to the neighbour on the right in y direction */
    /* receive the data from the neighbour on the left in y direction */
    MPI_Isend((void*)(k+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 104, g_cart_grid, &requests[reqcount]);
    MPI_Irecv((void*)(k+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 104, g_cart_grid, &requests[reqcount+1]);
    reqcount=reqcount+2;

#    endif

#    if (defined PARALLELXYZ || defined PARALLELXYZT)
    /* send the data to the neighbour on the left in z direction */
    /* receive the data from the neighbour on the right in z direction */
    if(ieo == 1) {
        MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    } else {
        MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    }
    if(ieo == 1) {
        MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    } else {
        MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    }

    /* send the data to the neighbour on the right in z direction */
    /* receive the data from the neighbour on the left in z direction */
    if(ieo == 1) {
        MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    } else {
        MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    }
    if(ieo == 1) {
        MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    } else {
        MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]);
        MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]);
        reqcount=reqcount+2;
    }

#    endif


    MPI_Waitall(reqcount, requests, status);
#  endif
    return;
#ifdef _KOJAK_INST
#pragma pomp inst end(xchange2fields)
#endif
}
Example #22
0
void deriv_Sb_D_psi(spinor * const l, spinor * const k, 
		    hamiltonian_field_t * const hf, const double factor) {
#ifdef BGL
  __alignx(16, l);
  __alignx(16, k);
#endif

  /* for parallelization */
#ifdef MPI
  xchange_lexicfield(k);
  xchange_lexicfield(l);
#endif
  
#ifdef OMP
/* under OpenMP the "static" keyword is defined away, so the scratch
 * variables declared below become automatic and thus private to each
 * thread; the matching #undef further down restores the keyword */
#define static
#pragma omp parallel
  {
#endif
  
  int ix,iy;
  su3 * restrict up ALIGN;
  su3 * restrict um ALIGN;
  static su3 v1,v2;
  static su3_vector psia,psib,phia,phib;
  static spinor rr;
  spinor * restrict sp ALIGN;
  spinor * restrict sm ALIGN;

#ifdef OMP
#undef static
#endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(derivSb)
#endif
#ifdef XLC
#pragma disjoint(*sp, *sm, *up, *um)
#endif

  /************** loop over all lattice sites ****************/
#ifdef OMP
#pragma omp for
#endif
  for(ix = 0; ix < (VOLUME); ix++){
    rr = (*(l + ix));

    /* multiply the left vector with gamma5 */
    _vector_minus_assign(rr.s2, rr.s2);
    _vector_minus_assign(rr.s3, rr.s3);

    /*********************** direction +0 ********************/

    iy=g_iup[ix][0];

    sp = k + iy;
    up=&hf->gaugefield[ix][0];
      
    _vector_add(psia,(*sp).s0,(*sp).s2);
    _vector_add(psib,(*sp).s1,(*sp).s3);
      
    _vector_add(phia,rr.s0,rr.s2);
    _vector_add(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][0], 2.*factor, v1);

    /************** direction -0 ****************************/

    iy=g_idn[ix][0];

    sm = k + iy;
    um=&hf->gaugefield[iy][0];
      
    _vector_sub(psia,(*sm).s0,(*sm).s2);
    _vector_sub(psib,(*sm).s1,(*sm).s3);

    _vector_sub(phia,rr.s0,rr.s2);
    _vector_sub(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka0,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][0], 2.*factor, v1);

    /*************** direction +1 **************************/

    iy=g_iup[ix][1];

    sp = k + iy;
    up=&hf->gaugefield[ix][1];      

    _vector_i_add(psia,(*sp).s0,(*sp).s3);
    _vector_i_add(psib,(*sp).s1,(*sp).s2);

    _vector_i_add(phia,rr.s0,rr.s3);
    _vector_i_add(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][1], 2.*factor, v1);

    /**************** direction -1 *************************/

    iy=g_idn[ix][1];

    sm = k + iy;
    um=&hf->gaugefield[iy][1];
      
    _vector_i_sub(psia,(*sm).s0,(*sm).s3);
    _vector_i_sub(psib,(*sm).s1,(*sm).s2);

    _vector_i_sub(phia,rr.s0,rr.s3);
    _vector_i_sub(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka1,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][1], 2.*factor, v1);

    /*************** direction +2 **************************/

    iy=g_iup[ix][2];

    sp = k + iy;
    up=&hf->gaugefield[ix][2];
      
    _vector_add(psia,(*sp).s0,(*sp).s3);
    _vector_sub(psib,(*sp).s1,(*sp).s2);
      
    _vector_add(phia,rr.s0,rr.s3);
    _vector_sub(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][2], 2.*factor, v1);

    /***************** direction -2 ************************/

    iy=g_idn[ix][2];

    sm = k + iy;
    um=&hf->gaugefield[iy][2];
      
    _vector_sub(psia,(*sm).s0,(*sm).s3);
    _vector_add(psib,(*sm).s1,(*sm).s2);

    _vector_sub(phia,rr.s0,rr.s3);
    _vector_add(phib,rr.s1,rr.s2);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka2,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][2], 2.*factor, v1);

    /****************** direction +3 ***********************/

    iy=g_iup[ix][3];

    sp = k + iy;
    up=&hf->gaugefield[ix][3];
      
    _vector_i_add(psia,(*sp).s0,(*sp).s2);
    _vector_i_sub(psib,(*sp).s1,(*sp).s3);

    _vector_i_add(phia,rr.s0,rr.s2);
    _vector_i_sub(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, phia, psia, phib, psib);
    _su3_times_su3d(v2,*up,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][3], 2.*factor, v1);

    /***************** direction -3 ************************/

    iy=g_idn[ix][3];

    sm = k + iy;
    um=&hf->gaugefield[iy][3];
      
    _vector_i_sub(psia,(*sm).s0,(*sm).s2);
    _vector_i_add(psib,(*sm).s1,(*sm).s3);

    _vector_i_sub(phia,rr.s0,rr.s2);
    _vector_i_add(phib,rr.s1,rr.s3);

    _vector_tensor_vector_add(v1, psia, phia, psib, phib);
    _su3_times_su3d(v2,*um,v1);
    _complex_times_su3(v1,ka3,v2);
    _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][3], 2.*factor, v1);
     
    /****************** end of loop ************************/
  }
#ifdef _KOJAK_INST
#pragma pomp inst end(derivSb)
#endif

#ifdef OMP
  } /*OpenMP closing brace */
#endif
}
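The first step in the site loop above, multiplying the left field by gamma5, relies on the basis this code uses, in which gamma_5 is diagonal with +1 on the two upper spin components and -1 on the two lower ones; the multiplication therefore reduces to negating s2 and s3, which is exactly what the two _vector_minus_assign calls do. A sketch with illustrative stand-in types:

typedef struct { double re, im; } cplx;
typedef struct { cplx c0, c1, c2; } su3vec;
typedef struct { su3vec s0, s1, s2, s3; } spin;

/* gamma_5 * psi: flip the sign of the lower two spin components */
static void mul_gamma5(spin *r) {
  cplx *lo = (cplx*)&r->s2;        /* s2 and s3 are adjacent in the struct */
  for(int i = 0; i < 6; i++) {     /* 2 spins x 3 colours */
    lo[i].re = -lo[i].re;
    lo[i].im = -lo[i].im;
  }
}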
Example #23
0
/* 4. -IIG */
void xchange_halffield() {

#  ifdef TM_USE_MPI

  MPI_Request requests[16];
  MPI_Status status[16];
#  if ((defined PARALLELT) || (defined PARALLELX))
  int reqcount = 4;
#  elif ((defined PARALLELXT) || (defined PARALLELXY))
  int reqcount = 8;
#  elif ((defined PARALLELXYT) || (defined PARALLELXYZ))
  int reqcount = 12;
#  elif defined PARALLELXYZT
  int reqcount = 16;
#  endif
#  if (defined XLC && defined BGL)
  __alignx(16, HalfSpinor);
#  endif

#ifdef _KOJAK_INST
#pragma pomp inst begin(xchangehalf)
#endif

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* send the data to the neighbour on the right in t direction */
  /* receive the data from the neighbour on the left in t direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE, 
	    g_nb_t_up, 81, g_cart_grid, &requests[0]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, 
	    g_nb_t_dn, 81, g_cart_grid, &requests[1]);
  /* send the data to the neighbour on the left in t direction */
  /* receive the data from the neighbour on the right in t direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, 
	    g_nb_t_dn, 82, g_cart_grid, &requests[2]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE, 
	    g_nb_t_up, 82, g_cart_grid, &requests[3]);
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in x direction */
  /* receive the data from the neighbour on the left in x direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE, 
	    g_nb_x_up, 91, g_cart_grid, &requests[4]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
	    g_nb_x_dn, 91, g_cart_grid, &requests[5]);
  /* send the data to the neighbour on the left in x direction */
  /* receive the data from the neighbour on the right in x direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE,
 	    g_nb_x_dn, 92, g_cart_grid, &requests[6]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE,
 	    g_nb_x_up, 92, g_cart_grid, &requests[7]);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in y direction */
  /* receive the data from the neighbour on the left in y direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE, 
	    g_nb_y_up, 101, g_cart_grid, &requests[8]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, 
	    g_nb_y_dn, 101, g_cart_grid, &requests[9]);
  /* send the data to the neighbour on the left in y direction */
  /* receive the data from the neighbour on the right in y direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, 
	    g_nb_y_dn, 102, g_cart_grid, &requests[10]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE, 
	    g_nb_y_up, 102, g_cart_grid, &requests[11]);
#    endif
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* send the data to the neighbour on the right in z direction */
  /* receive the data from the neighbour on the left in z direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_z), T*LX*LY*12/2, MPI_DOUBLE, 
		g_nb_z_up, 503, g_cart_grid, &requests[12]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_z + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE, 
		g_nb_z_dn, 503, g_cart_grid, &requests[13]); 
  /* send the data to the neighbour on the left in z direction */
  /* receive the data from the neighbour on the right in z direction */
  MPI_Isend((void*)(sendBuffer + g_HS_shift_z + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE, 
            g_nb_z_dn, 504, g_cart_grid, &requests[14]);
  MPI_Irecv((void*)(recvBuffer + g_HS_shift_z), T*LX*LY*12/2, MPI_DOUBLE, 
		g_nb_z_up, 504, g_cart_grid, &requests[15]); 
#    endif

  MPI_Waitall(reqcount, requests, status); 
#  endif /* MPI */
  return;

#ifdef _KOJAK_INST
#pragma pomp inst end(xchangehalf)
#endif
}
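The g_HS_shift_* offsets above index one contiguous buffer of half-spinors in which each direction owns a block of two face halves, the downward half first and the upward half a face-half later, as the +LX*LY*LZ/2, +T*LY*LZ/2, ... terms in the calls show. A sketch of how such offsets could be laid out, stated as an assumption for illustration; tmLQCD's actual initialisation lives elsewhere:

/* offsets in units of half-spinors (12 doubles each); illustrative only */
static void init_hs_shifts(int T, int LX, int LY, int LZ,
                           int *shift_t, int *shift_x, int *shift_y, int *shift_z) {
  *shift_t = 0;
  *shift_x = *shift_t + LX*LY*LZ;  /* two t faces of LX*LY*LZ/2 sites each */
  *shift_y = *shift_x + T*LY*LZ;   /* two x faces of T*LY*LZ/2 sites each */
  *shift_z = *shift_y + T*LX*LZ;   /* two y faces of T*LX*LZ/2 sites each */
}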