Example #1
 cpuCloverField::~cpuCloverField() { 
   if (create != QUDA_REFERENCE_FIELD_CREATE) {
     if (clover) host_free(clover);
     if (norm) host_free(norm);
     if (cloverInv) host_free(cloverInv);
     if (invNorm) host_free(invNorm);      
   }
 }
Example #2
static int 
getNumaAffinity(int my_gpu, int *cpu_cores, int* ncores)
{
  FILE *nvidia_info, *pci_bus_info;
  size_t nbytes = 255;
  char *my_line;
  char nvidia_info_path[255], pci_bus_info_path[255];
  char bus_info[255];
  
  // the nvidia driver populates this path for each gpu
  sprintf(nvidia_info_path,"/proc/driver/nvidia/gpus/%d/information", my_gpu);
  nvidia_info= fopen(nvidia_info_path,"r");
  if (nvidia_info == NULL){
    return -1;
  }
  
  my_line= (char *) safe_malloc(nbytes +1);
  
  while (!feof(nvidia_info)){
    if ( -1 == getline(&my_line, &nbytes, nvidia_info)){
      break;
    }else{
      // the first 7 char of the Bus Location will lead to the corresponding
      // path under /sys/class/pci_bus/  , cpulistaffinity showing cores on that
      // bus is located there
      if ( 1 == sscanf(my_line,"Bus Location: %s", bus_info )){
	sprintf(pci_bus_info_path,"/sys/class/pci_bus/%.7s/cpulistaffinity",
		bus_info);
      }
    }
  }
  // open the cpulistaffinity file on the pci_bus for "my_gpu"
  pci_bus_info= fopen(pci_bus_info_path,"r");
  if (pci_bus_info == NULL){
    //printfQuda("Warning: opening file %s failed\n", pci_bus_info_path);
    host_free(my_line);
    fclose(nvidia_info);
    return -1;
  }
  
  while (!feof(pci_bus_info)){
    if ( -1 == getline(&my_line, &nbytes, pci_bus_info)){
      break;
    } else{
      int rc = process_core_string_list(my_line, cpu_cores, ncores);
      if(rc < 0){
	warningQuda("Failed to process the line \"%s\"", my_line);
	host_free(my_line);
	fclose(nvidia_info);
	fclose(pci_bus_info);
	return -1;
      }
    }
  }
  
  host_free(my_line);
  fclose(nvidia_info);
  fclose(pci_bus_info);
  return 0;
}
Example #3
  void cpuColorSpinorField::freeGhostBuffer(void)
  {
    if(!initGhostFaceBuffer) return;

    for(int i=0;i < 4; i++){
      host_free(fwdGhostFaceBuffer[i]); fwdGhostFaceBuffer[i] = NULL;
      host_free(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL;
      host_free(fwdGhostFaceSendBuffer[i]); fwdGhostFaceSendBuffer[i] = NULL;
      host_free(backGhostFaceSendBuffer[i]);  backGhostFaceSendBuffer[i] = NULL;
    } 
    initGhostFaceBuffer = 0;
  }
Example #4
  void cpuColorSpinorField::destroy() {
  
    if (init) {
      if (fieldOrder == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) 
	for (int i=0; i<x[nDim-1]; i++) host_free(((void**)v)[i]);
      host_free(v);
      init = false;
    }

    if (siteSubset == QUDA_FULL_SITE_SUBSET) {
      if (even) delete even;
      if (odd) delete odd;
    }

  }
Example #5
static void loadParityHw(ParityHw ret, void *hw, QudaPrecision cpu_prec)
{
    if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
        errorQuda("CUDA double precision requires CPU double precision");
    }

    if (ret.precision != QUDA_HALF_PRECISION) {
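        // pack the host field into the device layout, then copy it to the GPU with a single cudaMemcpy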

        void *packedHw1 = pinned_malloc(ret.bytes);

        if (ret.precision == QUDA_DOUBLE_PRECISION) {
            packParityHw((double2*)packedHw1, (double*)hw, ret.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                packParityHw((float2*)packedHw1, (double*)hw, ret.volume);
            } else {
                packParityHw((float2*)packedHw1, (float*)hw, ret.volume);
            }
        }
        cudaMemcpy(ret.data, packedHw1, ret.bytes, cudaMemcpyHostToDevice);
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(ret.X, QUDA_SINGLE_PRECISION);
          loadParityHw(tmp, hw, cpu_prec, dirac_order);
          copyCuda(ret, tmp);
          freeParityHw(tmp);
        */
    }
}
Example #6
static void retrieveParityHw(void *res, ParityHw hw, QudaPrecision cpu_prec)
{
    if (hw.precision != QUDA_HALF_PRECISION) {
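        // copy the packed field back from the GPU, then unpack it into the host layout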

        void *packedHw1 = pinned_malloc(hw.bytes);
        cudaMemcpy(packedHw1, hw.data, hw.bytes, cudaMemcpyDeviceToHost);

        if (hw.precision == QUDA_DOUBLE_PRECISION) {
            unpackParityHw((double*)res, (double2*)packedHw1, hw.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                unpackParityHw((double*)res, (float2*)packedHw1, hw.volume);
            } else {
                unpackParityHw((float*)res, (float2*)packedHw1, hw.volume);
            }
        }
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(hw.X, QUDA_SINGLE_PRECISION);
          copyCuda(tmp, hw);
          retrieveParityHw(res, tmp, cpu_prec, dirac_order);
          freeParityHw(tmp);
        */
    }
}
Example #7
  void cpuColorSpinorField::destroy() {
  
    if (precision == QUDA_DOUBLE_PRECISION) {
      delete order_double;
    } else if (precision == QUDA_SINGLE_PRECISION) {
      delete order_single;
    } else {
      errorQuda("Precision %d not supported", precision);
    }
  
    if (init) {
      if (fieldOrder == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) 
	for (int i=0; i<x[nDim-1]; i++) host_free(((void**)v)[i]);
      host_free(v);
      init = false;
    }

  }
Example #8
void 
comm_init(void)
{
  int i;
  
  static int firsttime=1;
  if (!firsttime) return;
  firsttime = 0;

  gethostname(hostname, 128);
  hostname[127] = '\0';

  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int gpus_per_node = getGpuCount();  

  comm_partition();

  back_nbr = (rank -1 + size)%size;
  fwd_nbr = (rank +1)%size;
  num_nodes=size / getGpuCount();
  if (num_nodes == 0) {
    num_nodes = 1;
  }

  //determine which gpu this MPI process is going to use
  char* hostname_recv_buf = (char*)safe_malloc(128*size);
  
  int rc = MPI_Allgather(hostname, 128, MPI_CHAR, hostname_recv_buf, 128, MPI_CHAR, MPI_COMM_WORLD);
  if (rc != MPI_SUCCESS){
    printf("ERROR: MPI_Allgather failed for hostname\n");
    comm_exit(1);
  }

  which_gpu=0;
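// count how many lower-ranked processes report the same hostname; that count selects this rank's local GPU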
  for(i=0;i < size; i++){
    if (i == rank){
      break;
    }
    if (strncmp(hostname, hostname_recv_buf + 128*i, 128) == 0){
      which_gpu ++;
    }
  }
  
  if (which_gpu >= gpus_per_node){
    printf("ERROR: invalid gpu(%d) to use in rank=%d mpi process\n", which_gpu, rank);
    comm_exit(1);
  }
  
  srand(rank*999);
  
  host_free(hostname_recv_buf);
  return;
}
Example #9
  cpuGaugeField::~cpuGaugeField()
  {
    if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
      if (order == QUDA_QDP_GAUGE_ORDER) {
	for (int d=0; d<nDim; d++) {
	  if (gauge[d]) host_free(gauge[d]);
	}
	if (gauge) host_free(gauge);
      } else {
	if (gauge) host_free(gauge);
      }
    } else { // QUDA_REFERENCE_FIELD_CREATE 
      if (order == QUDA_QDP_GAUGE_ORDER){
	if (gauge) host_free(gauge);
      }
    }
  
    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
      for (int i=0; i<nDim; i++) {
	if (ghost[i]) host_free(ghost[i]);
      }
    }
  }
Example #10
/* Deallocate all data and structures for the given data instance. */
LOCAL void delete_NIDDB IFN1
   (
   int, record_id	/* (I ) Record ID (ie virtualising byte value) */
   )
   {
   int i;
   IHP *instance_ptr;

   /* Ensure instance is active (for terminate callback). */
   swap_NIDDB(record_id);

   instance_ptr = vrecs[record_id].vr_pinst_tbl;

   /* Deallocate all data areas in instance_ptr table. */
   for (i = 0; i < MAX_INSTANCES; i++, instance_ptr++)
      {
      if ( *instance_ptr != (IHP)0 )
	 {
	 /* Action any terminate callback */
	 if ( terminate_callback[i] != (NIDDB_TM_CALLBACK)0 )
	    {
	    (terminate_callback[i])();
	    }

	 /* free up memory */
	 host_free(*instance_ptr);
	 }
      }

   /* Deallocate the instance_ptr table itself */
   host_free((IHP)vrecs[record_id].vr_pinst_tbl);

   /* Initialise virtual record entry */
   vrecs[record_id].vr_pinst_tbl = (IHP *)0;
   vrecs[record_id].vr_inst_handle = (IU32)0;
   }
Example #11
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  int initialized;
  MPI_CHECK( MPI_Initialized(&initialized) );

  if (!initialized) {
    errorQuda("MPI has not been initialized");
  }

  MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &rank) );
  MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &size) );

  int grid_size = 1;
  for (int i = 0; i < ndim; i++) {
    grid_size *= dims[i];
  }
  if (grid_size != size) {
    errorQuda("Communication grid size declared via initCommsGridQuda() does not match"
              " total number of MPI ranks (%d != %d)", grid_size, size);
  }

  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
  comm_set_default_topology(topo);

  // determine which GPU this MPI rank will use
  char *hostname = comm_hostname();
  char *hostname_recv_buf = (char *)safe_malloc(128*size);
  
  MPI_CHECK( MPI_Allgather(hostname, 128, MPI_CHAR, hostname_recv_buf, 128, MPI_CHAR, MPI_COMM_WORLD) );

  gpuid = 0;
  for (int i = 0; i < rank; i++) {
    if (!strncmp(hostname, &hostname_recv_buf[128*i], 128)) {
      gpuid++;
    }
  }
  host_free(hostname_recv_buf);

  int device_count;
  cudaGetDeviceCount(&device_count);
  if (device_count == 0) {
    errorQuda("No CUDA devices found");
  }
  if (gpuid >= device_count) {
    errorQuda("Too few GPUs available on %s", hostname);
  }
}
Example #12
  // This does the exchange of the gauge field ghost zone and places it
  // into the ghost array.
  void cpuGaugeField::exchangeGhost() {
    if (ghostExchange) return;

    void *send[QUDA_MAX_DIM];
    for (int d=0; d<nDim; d++) send[d] = safe_malloc(nFace*surface[d]*reconstruct*precision);

    // get the links into contiguous buffers
    extractGaugeGhost(*this, send);

    // communicate between nodes
    FaceBuffer faceBuf(x, nDim, reconstruct, nFace, precision);
    faceBuf.exchangeLink(ghost, send, QUDA_CPU_FIELD_LOCATION);

    for (int d=0; d<nDim; d++) host_free(send[d]);

    ghostExchange = true;
  }
Example #13
void
host_load(void)
{
    FILE *f;
    int ip;
    char name[80];
    char *p = name;

    if (!(f = fopen("nethost.cache", "r")))
        return;

    if (hostcache_list)
        host_free();

    while (fscanf(f, "%x %79s\n", &ip, p) == 2)
        host_add(ip, name);

    fclose(f);
}
Example #14
status_t CameraAcc::acc_read_fw(const char* filename, fw_info &fw)
{
    LOG1("@%s", __FUNCTION__);

    FILE* file;
    if (!filename)
        return UNKNOWN_ERROR;

    LOG1("filename=%s", filename);
    fw.size = 0;
    fw.data = NULL;

    file = fopen(filename, "rb");
    if (!file)
        return UNKNOWN_ERROR;

    fseek(file, 0, SEEK_END);
    fw.size = ftell(file);
    fseek(file, 0, SEEK_SET);

    if (fw.size == 0) {
        fclose(file);
        return UNKNOWN_ERROR;
    }

    fw.data = host_alloc(fw.size);
    if (fw.data == NULL) {
        fclose(file);
        return UNKNOWN_ERROR;
    }

    if (fread(fw.data, 1, fw.size, file) != fw.size) {
        fclose(file);
        host_free(fw.data);
        return UNKNOWN_ERROR;
    }

    fclose(file);

    return NO_ERROR;
}
Example #15
  void cpuColorSpinorField::exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *dummy1,
					  const MemoryLocation *dummy2, bool dummy3, bool dummy4) const
  {
    // allocate ghost buffer if not yet allocated
    allocateGhostBuffer(nFace);

    void **sendbuf = static_cast<void**>(safe_malloc(nDimComms * 2 * sizeof(void*)));

    for (int i=0; i<nDimComms; i++) {
      sendbuf[2*i + 0] = backGhostFaceSendBuffer[i];
      sendbuf[2*i + 1] = fwdGhostFaceSendBuffer[i];
      ghost_buf[2*i + 0] = backGhostFaceBuffer[i];
      ghost_buf[2*i + 1] = fwdGhostFaceBuffer[i];
    }

    packGhost(sendbuf, parity, nFace, dagger);

    exchange(ghost_buf, sendbuf, nFace);

    host_free(sendbuf);
  }
Example #16
/* Deallocate per Virtual Machine data area for Device Driver. */
GLOBAL void NIDDB_Deallocate_Instance_Data IFN1
   (
   IHP *, handle	/* Handle to data area */
   )
   {
   int i;

   if ( !allocation_allowed )
      {
      /*
	 We are still managing instances for Windows, or at least we think we 
	 are. Has the user escaped from Windows without our VxD being informed,
	 or is the Insignia Device Driver giving us a bum steer?
       */
      always_trace0("NIDDB: Unexpected call to NIDDB_Deallocate_Instance_Data.");

      /* We might give the Insignia Device Driver the benefit of the doubt and
	 act like a System_Exit message. - Then again we might just ignore em. */
      return;
      }

   /* Find index to master_ptrs, etc. */
   i = handle - &master_ptrs[0];
   if ( i < 0 || i >= MAX_INSTANCES )
      {
      always_trace0("NIDDB: Bad handle passed to NIDDB_Deallocate_Instance_Data.");
      return;
      }

   /* Free data area */
   host_free(master_ptrs[i]);

   /* Initialise entry */
   master_ptrs[i] = snapshot_ptrs[i] = (IHP)0;
   instance_size[i] = (IU32)0;
   create_callback[i] = (NIDDB_CR_CALLBACK)0;
   terminate_callback[i] = (NIDDB_TM_CALLBACK)0;

   return;
   }
Example #17
/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

	void *p;	/* Returned page */
	struct vm_page_list * page_list;
	vm_page_t     pages;
	unsigned long  size, page_num, pfn;
	int i;

	size= round_page(bytes);
	page_num = size >> PAGE_SHIFT;

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_FREEBSD_KERNEL;
	p = (void *)host_malloc(size, PAGE_SIZE); // kmem_malloc(kmem_map, bytes, wait);
	pfn = ((unsigned long)p) >> PAGE_SHIFT;
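	// register each backing page in page_slab_hash, keyed by its page frame number, so it can be looked up later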

	if(p != NULL){
		pages = (vm_page_t)host_malloc(sizeof(struct vm_page) * page_num, -1);
		if(pages != NULL){
			for(i = 0; i < page_num; i ++, pfn ++){
				page_list = &page_slab_hash[pfn % MAX_UPTCP_PAGENUM];
				pages[i].page_addr = (uint8_t*)(pfn << PAGE_SHIFT);
				pages[i].flags = 0;
				pages[i].object= NULL;
				SLIST_INSERT_HEAD(page_list, &pages[i], page_link);
			}
		} else {
			host_free(p);
			return NULL;
		}
	}

	return (p);
	
}
Example #18
static void
gauge_force_test(void) 
{
  int max_length = 6;    
  
  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;
  
  setDims(qudaGaugeParam.X);
  
  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;  
  qudaGaugeParam.reconstruct_sloppy = link_recon;  
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links   
  
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec;
    
  void* sitelink;  
  void* sitelink_1d;
  
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif
  
  // this is a hack to have site link generated in 2d 
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif
  
  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
  
  //copy the 2d sitelink to 1d milc format 
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
    sitelink =  sitelink_1d;    
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }
  
#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];
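  // copy interior links into the matching sites of the 2-deep halo-extended arrays;
  // the halo sites themselves are skipped here (filled by exchange_cpu_sitelink_ex() before the reference computation)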

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }
    
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
      continue;
    }
    
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;
    
    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i
  
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);      
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);
  
  
  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }
    
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
  
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));    
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }
  
  struct timeval t0, t1;
  double timeinfo[3];
  /* Run multiple attempts so the warmup cost of the first run can be excluded */
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
			  loop_coeff_d, num_paths, max_length, eb3,
			  &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }
 
  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;
    
  if (verify_results){	
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      // last arg = 0 means no communication optimization, i.e. exchange data in all directions
      // even if they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
			       QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);    
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#endif
    }
  
    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    
    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }  

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
	 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }
  
  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);
  
#ifdef MULTI_GPU  
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif


  host_free(mom);
  host_free(refmom);
  endQuda();
}            
Example #19
void comm_free(void *handle) {
  host_free((MPI_Request*)handle);
}
Example #20
/* Allocate data structures required for new data instance. */
LOCAL IBOOL allocate_NIDDB IFN2
   (
   IU32,  inst_handle,	/* (I ) Windows handle for Virtual Machine */
   int *, record_id	/* ( 0) Record ID (ie virtualising byte value) */
   )
   {
   int v;
   int i;
   IHP *p;
   IHP *instance_ptr;

   /* Search for empty virtual record */
   for (v = 0; v < MAX_VMS; v++)
      {
      if ( vrecs[v].vr_pinst_tbl == (IHP *)0 )
	 break;   /* found empty slot */
      }

   /* Ensure we found empty slot */
   if ( v == MAX_VMS )
      {
      /* No free slot! */
      always_trace0("NIDDB: Too many Virtual Machines being requested.");
      return FALSE;
      }

   /* Allocate new instance table - ensure it is zero */
   if ( (instance_ptr = (IHP *)host_calloc(1, sizeof(master_ptrs))) == (IHP *)0 )
      {
      /* No room at the inn */
      return FALSE;
      }

   /* Allocate new data areas */
   for (i = 0, p = instance_ptr; i < MAX_INSTANCES; i++, p++)
      {
      /* Use master pointer as the 'creation template' */
      if ( master_ptrs[i] != (IHP *)0 )
	 {
	 if ( (*p = (IHP)host_malloc(instance_size[i])) == (IHP)0 )
	    {
	    /* No room at the inn */

	    /* Clean up any blocks which may have been allocated */
	    for (i = 0, p = instance_ptr; i < MAX_INSTANCES; i++, p++)
	       {
	       if ( *p != (IHP)0 )
		  host_free(*p);
	       }

	    /* Release the partially built instance table itself */
	    host_free((IHP)instance_ptr);

	    return FALSE;
	    }
	 }
      }

   /* Finally fill in virtual record */
   vrecs[v].vr_inst_handle = inst_handle;
   vrecs[v].vr_pinst_tbl = instance_ptr;
   *record_id = v;
   return TRUE;
   }
Example #21
  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
  {
    if (cudaGauge->Precision() != cpuGauge->Precision()){
      errorQuda("Mismatch between CPU precision and CUDA precision");
    }
    QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
    const int* Z = cudaGauge->X();
#endif
    int pad = cudaGauge->Pad();
    int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
    int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
    int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
    int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;

    static void* ghost_cpuGauge[4];
    static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
    static int allocated = 0;
    int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
  
    if (!allocated) {

      for(int i=0;i < 4; i++) {
	size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;      
#ifdef GPU_DIRECT 
	ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
#else
	ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
#endif
      }

      /*
       *  nu |     |
       *     |_____|
       *       mu     
       */
    
      for(int nu=0;nu < 4;nu++){
	for(int mu=0; mu < 4;mu++){
	  if(nu == mu){
	    ghost_cpuGauge_diag[nu*4+mu] = NULL;
	  }else{
	    //the other directions
	    int dir1, dir2;
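	    // pick the two directions orthogonal to both mu and nu; the diagonal ghost buffer covers the Z[dir1] x Z[dir2] plane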
	    for(dir1= 0; dir1 < 4; dir1++){
	      if(dir1 !=nu && dir1 != mu){
		break;
	      }
	    }
	    for(dir2=0; dir2 < 4; dir2++){
	      if(dir2 != nu && dir2 != mu && dir2 != dir1){
		break;
	      }
	    }
	    //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);

	    size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT 
	    ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
#else
	    ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
#endif
	    memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
	  }	
	}
      }
      allocated = 1;
    }

    int optflag=1;
    // driver for packalllink
    exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
  
#endif
  
    do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(), 
		     ghost_cpuGauge, ghost_cpuGauge_diag, 
		     cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad, 
		     Vsh_x, Vsh_y, Vsh_z, Vsh_t, 
		     prec, cpuGauge->Order());
  
#ifdef MULTI_GPU
    if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    
      for(int i=0;i < 4;i++){
	host_free(ghost_cpuGauge[i]);
      }
      for(int i=0;i <4; i++){ 
	for(int j=0;j <4; j++){
	  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
	}
      }    
      allocated = 0;
    }
#endif
  
  }
Example #22
 CloverField::~CloverField() {
   host_free(trlog);
 }
Example #23
void
host_shutdown(void)
{
    host_save();
    host_free();
}
Example #24
static int mb_config_add_host (oconfig_item_t *ci) /* {{{ */
{
  mb_host_t *host;
  int status;
  int i;

  host = malloc (sizeof (*host));
  if (host == NULL)
    return (ENOMEM);
  memset (host, 0, sizeof (*host));
  host->slaves = NULL;

  status = cf_util_get_string_buffer (ci, host->host, sizeof (host->host));
  if (status != 0)
  {
    host_free (host);
    return (status);
  }
  if (host->host[0] == 0)
  {
    host_free (host);
    return (EINVAL);
  }

  for (i = 0; i < ci->children_num; i++)
  {
    oconfig_item_t *child = ci->children + i;
    status = 0;

    if (strcasecmp ("Address", child->key) == 0)
    {
      char buffer[NI_MAXHOST];
      status = cf_util_get_string_buffer (child, buffer, sizeof (buffer));
      if (status == 0)
        status = mb_config_set_host_address (host, buffer);
    }
    else if (strcasecmp ("Port", child->key) == 0)
    {
      host->port = cf_util_get_port_number (child);
      if (host->port <= 0)
        status = -1;
    }
    else if (strcasecmp ("Interval", child->key) == 0)
      status = cf_util_get_cdtime (child, &host->interval);
    else if (strcasecmp ("Slave", child->key) == 0)
      /* Don't set status: Gracefully continue if a slave fails. */
      mb_config_add_slave (host, child);
    else
    {
      ERROR ("Modbus plugin: Unknown configuration option: %s", child->key);
      status = -1;
    }

    if (status != 0)
      break;
  } /* for (i = 0; i < ci->children_num; i++) */

  assert (host->host[0] != 0);
  if (host->host[0] == 0)
  {
    ERROR ("Modbus plugin: Data block \"%s\": No type has been specified.",
        host->host);
    status = -1;
  }

  if (status == 0)
  {
    user_data_t ud;
    char name[1024];
    struct timespec interval = { 0, 0 };

    ud.data = host;
    ud.free_func = host_free;

    ssnprintf (name, sizeof (name), "modbus-%s", host->host);

    CDTIME_T_TO_TIMESPEC (host->interval, &interval);

    plugin_register_complex_read (/* group = */ NULL, name,
        /* callback = */ mb_read,
        /* interval = */ (host->interval > 0) ? &interval : NULL,
        &ud);
  }
  else
  {
    host_free (host);
  }

  return (status);
} /* }}} int mb_config_add_host */
Example #25
void comm_free(MsgHandle *mh)
{
  host_free(mh);
}
Example #26
void comm_free(MsgHandle *mh)
{
  QMP_free_msghandle(mh->handle);
  QMP_free_msgmem(mh->mem);
  host_free(mh);
}
Example #27
void comm_destroy_topology(Topology *topo)
{
  host_free(topo->ranks);
  host_free(topo->coords);
  host_free(topo);
}