C++ (Cpp) sys_cacheflush 예제들

예제 #1

0

파일 보기

파일: shift_util.C 프로젝트: DeanHowarth/QUDA-CPS

CPS_START_NAMESPACE
void GlobalDataShift::Shift(int mu, int n_disp){
  if (n_disp==0) return;
 
  SCUDir s_dir,r_dir;
  int a_disp;
  void *send_p,*recv_p,*temp_p;
  if (n_disp>0){
   a_disp = n_disp;
   s_dir = gjp_scu_dir[mu*2];
   r_dir = gjp_scu_dir[mu*2+1];
  } else {
   a_disp = -n_disp;
   s_dir = gjp_scu_dir[mu*2+1];
   r_dir = gjp_scu_dir[mu*2];
  }

  send_p = addr;
  recv_p = temp_buf;

  SCUDirArgIR Send(send_p,s_dir,SCU_SEND,data_len);
  SCUDirArgIR Recv(recv_p,r_dir,SCU_REC,data_len);

  sys_cacheflush(0);
  for(int i = 0;i<a_disp-1;i++){
    Send.StartTrans();Recv.StartTrans();
    Send.TransComplete();Recv.TransComplete();
    temp_p = send_p;
    send_p = recv_p;
    recv_p = temp_p;
    Send.Addr(send_p);
    Recv.Addr(recv_p);
  }
  Send.StartTrans();Recv.StartTrans();
  Send.TransComplete();Recv.TransComplete();

  if (recv_p != addr)
    memcpy(addr,recv_p,data_len);
}

예제 #2

0

파일 보기

파일: warm.cpp 프로젝트: Bernieboy/nds4ios

int warm_cache_op_range(int op, void *addr, unsigned long size)
{
	struct warm_cache_op wop;
	int ret;

	if (warm_fd < 0) {
		/* note that this won't work for warm_cache_op_all */
		sys_cacheflush(addr, (char *)addr + size);
		return -1;
	}

	wop.ops = op;
	wop.addr = (unsigned long)addr;
	wop.size = size;

	ret = ioctl(warm_fd, WARMC_CACHE_OP, &wop);
	if (ret != 0) {
		perror("WARMC_CACHE_OP failed");
		return -1;
	}

	return 0;
}

예제 #3

0

파일 보기

파일: dirac.C 프로젝트: DeanHowarth/QUDA-CPS

void stag_dirac(Vector* b_v, Vector* a_v, int a_odd, int add_flag, int dir_flag)
{
  IFloat* b = (IFloat *)b_v;
  IFloat* a = (IFloat *)a_v;

  int c =0;
  int odd = 1 - a_odd;
  //-----------------------------------------------------------------
  //  Transfer chi's on faces.  
  //-----------------------------------------------------------------

  copy_buffer(countM[a_odd], (long)a, (long)Tbuffer[0], (long)ToffsetM[a_odd]);
  copy_buffer(countP[a_odd], (long)a, (long)Tbuffer[1], (long)ToffsetP[a_odd]);

  //make sure spinor field is in main memory before starting transfers
//  flush_cache_spinor(nflush, (long)a);

  SCUarg[1]->Addr( a + Xoffset[1]);
  SCUarg[5]->Addr( a + Xoffset[5]);
  SCUarg[2]->Addr( a + Xoffset[2]);
  SCUarg[6]->Addr( a + Xoffset[6]);
  SCUarg[3]->Addr( a + Xoffset[3]);
  SCUarg[7]->Addr( a + Xoffset[7]);
//    printf("Setting offsets done\n");
  
  sys_cacheflush(0);
//  for(int i = 0;i<2*NUM_DIR;i++) SCUarg[i]->Print();
//  exit(-34);
  for(int i = 0;i<2*NUM_DIR;i++) SCUarg[i]->Assert();
  SCUmulti->SlowStartTrans();
//  printf("SCUmulti started\n");
//  save_reg((long)intreg, (long)dreg);
  //-----------------------------------------------------------------
  //do first local computations
  //-----------------------------------------------------------------


#ifdef CPP
  dirac_cmv_l( local_chi/2, (long)chi_l[odd], (long)uc_l[odd], 
	       (long)a, (long)tmpfrm);
#else
  dirac_cmv( local_chi/2, (long)chi_l[odd], (long)uc_l[odd], 
	       (long)a, (long)tmpfrm);
#endif


  //-----------------------------------------------------------------
  // check to see if transfers are done
  //-----------------------------------------------------------------
//  for(int i = 0;i<2*NUM_DIR;i++) SCUarg[i]->Assert();
  SCUmulti->TransComplete();
//  printf("SCUmulti ended\n");

  //-----------------------------------------------------------------
  //do the computations involving non-local spinors
  //-----------------------------------------------------------------

#if 1
  dirac_cmv( non_local_chi/2, (long)chi_nl[odd], (long)uc_nl[odd], 
		(long)c, (long)tmpfrm);
#else
  dirac_cmv_nl( non_local_chi/2, (long)chi_nl[odd], (long)uc_nl[odd], 
		(long)c, (long)tmpfrm);
#endif


  //-----------------------------------------------------------------
  //do the sum of 8 temporary vectors at each lattice site
  //-----------------------------------------------------------------

  if ( add_flag == 0){
    dirac_sum( vol/2, (long)chi[odd], (long)tmpfrm, (long)b);
  }
  else{
    dirac_sum_acc( vol/2, (long)chi[odd], (long)tmpfrm, (long)b);
  }

  DiracOp::CGflops += 285*vol;


}

예제 #4

0

파일 보기

파일: f_dwf_base_force.C 프로젝트: DeanHowarth/QUDA-CPS

// CJ: change start
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, 
//                 Float dt):
// It evolves the canonical momentum mom by dt
// using the fermion force.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, 
			   Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);
  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)
    ERR.General(cname,fname,"Wrong nbr of colors.") ;
 
  if (SpinComponents() != 4)
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;
 
  if (mom == 0)
    ERR.Pointer(cname,fname,"mom") ;
 
  if (chi == 0)
    ERR.Pointer(cname,fname,"chi") ;
 
  //----------------------------------------------------------------
  // allocate space for two CANONICAL fermion fields
  //----------------------------------------------------------------

  int f_size = FsiteSize() * GJP.VolNodeSites() ;
  int f_site_size_4d = 2 * Colors() * SpinComponents();
  int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ;
 
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)fmalloc(cname,fname,str_v1,f_size*sizeof(Float));
//  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
//  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)fmalloc(cname,fname,str_v2,f_size*sizeof(Float)) ;
//  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
//  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

//  LatMatrix MomDiff(QFAST,4);
//  Matrix *mom_diff = MomDiff.Mat();

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  //----------------------------------------------------------------
  // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after
  // the calculation.
  //----------------------------------------------------------------  

  VRB.Clock(cname, fname, "Before calc force vecs.\n") ;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif
  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_NO) ;
    dwf.CalcHmdForceVecs(chi) ;
    Fconvert(v1,CANONICAL,WILSON);
    Fconvert(v2,CANONICAL,WILSON);
  }
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  int mu, x, y, z, t, s, lx, ly, lz, lt, ls ;
  int size[5],surf[4],blklen[4],stride[4],numblk[4];
 
  size[0] = lx = GJP.XnodeSites() ;
  size[1] = ly = GJP.YnodeSites() ;
  size[2] = lz = GJP.ZnodeSites() ;
  size[3] = lt = GJP.TnodeSites() ;
  size[4] = ls = GJP.SnodeSites() ;

  blklen[0] = sizeof(Float)*FsiteSize()/size[4];
  numblk[0] = GJP.VolNodeSites()/size[0]*size[4];
  stride[0] = blklen[0] * (size[0]-1);
  for (int i =1;i<4;i++){
    blklen[i] = blklen[i-1] * size[i-1];
	numblk[i] = numblk[i-1] / size[i];
    stride[i] = blklen[i] * (size[i]-1);
  }
  for (int i =0;i<4;i++){
    surf[i] = GJP.VolNodeSites()/size[i];
  }

  //----------------------------------------------------------------
  // allocate buffer space for two fermion fields that are assoc
  // with only one 4-D site.
  //----------------------------------------------------------------

  unsigned long flag = QFAST;
  if (ls<4) flag|= QNONCACHE;
  char *str_site_v1 = "site_v1" ;
  char *str_site_v2 = "site_v2" ;

  Float *v1_buf[4],*v2_buf[4];
  int pos[4];
  Float *v1_buf_pos[4];
  Float *v2_buf_pos[4];
  for(int i =0;i<4;i++) {
    v1_buf[i]=(Float *)fmalloc(cname,fname,"v1_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
    v2_buf[i]=(Float *)fmalloc(cname,fname,"v2_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
  }

//  Matrix tmp_mat1, tmp_mat2 ;
  Matrix *tmp_mat1,*tmp_mat2;
  tmp_mat1 = (Matrix *)fmalloc(cname,fname,"tmp_mat1",sizeof(Matrix)*2);
  tmp_mat2 = tmp_mat1+1;

  SCUDirArgIR Send[4];
  SCUDirArgIR Recv[4];
  SCUDMAInst *dma[2];
  for(int i = 0;i<2;i++) dma[i] = new SCUDMAInst;
  void *addr[2];
  int f_bytes = sizeof(Float)*f_site_size_4d;
  int st_bytes = sizeof(Float)*f_size_4d - f_bytes;
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      dma[0]->Init(v1_buf[mu],surf[mu]*ls*f_bytes);
      dma[1]->Init(v2_buf[mu],surf[mu]*ls*f_bytes);
      Recv[mu].Init(gjp_scu_dir[2*mu],SCU_REC,dma,2);
      dma[0]->Init(v1,blklen[mu],numblk[mu],stride[mu]);
      dma[1]->Init(v2,blklen[mu],numblk[mu],stride[mu]);
      Send[mu].Init(gjp_scu_dir[2*mu+1],SCU_SEND,dma,2);
    }
  }
  for(int i = 0;i<2;i++) delete dma[i];

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

#ifdef PROFILE
  time = -dclock();
  ForceFlops=0;
#endif

  sys_cacheflush(0);
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].StartTrans();
      Send[mu].StartTrans();
    }
  }
  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
      } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
            if ((pos[mu]+1) == size[mu])
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() > 1) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;

      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].TransComplete();
      Send[mu].TransComplete();
    }
  }
 
//------------------------------------------------------------------
// start by summing first over direction (mu) and then over site
// to allow SCU transfers to happen face-by-face in the outermost
// loop.
//------------------------------------------------------------------

  for(int i = 0;i<4;i++){
    v1_buf_pos[i] = v1_buf[i];
    v2_buf_pos[i] = v2_buf[i];
  }

  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
            v1_plus_mu = v1_buf_pos[mu] ;
            v2_plus_mu = v2_buf_pos[mu] ;
 			v1_buf_pos[mu] += f_site_size_4d;
 			v2_buf_pos[mu] += f_site_size_4d;
            vec_plus_mu_stride = (surf[mu] -1)*f_site_size_4d ;
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() >1 ) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;
      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu

  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
 
//------------------------------------------------------------------
// deallocate smalloc'd space
//------------------------------------------------------------------

  for(int i =0;i<4;i++) {
    ffree(v1_buf[i],cname,fname,"v1_buf");
    ffree(v2_buf[i],cname,fname,"v2_buf");
  }
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  ffree(v2) ;
 
  VRB.Sfree(cname, fname, str_v1, v1) ;
  ffree(v1) ;

  ffree(tmp_mat1);
 
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  return ForceArg(L1, sqrt(L2), Linf);
}