Example #1
 cpuCloverField::~cpuCloverField() { 
   if (create != QUDA_REFERENCE_FIELD_CREATE) {
     if (clover) host_free(clover);
     if (norm) host_free(norm);
     if (cloverInv) host_free(cloverInv);
     if (invNorm) host_free(invNorm);      
   }
 }
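
The guard on create encodes the ownership rule these fields follow: buffers are released only if the field allocated them itself. A minimal sketch of the allocation side this destructor assumes (parameter and size names are illustrative, not taken from the source):

  if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
    clover = safe_malloc(bytes);   // owned: released by the destructor
    norm = safe_malloc(norm_bytes);
  } else {
    clover = param.clover;         // wrapped user memory: the caller frees it
    norm = param.norm;
  }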
Example #2
static int
getNumaAffinity(int my_gpu, int *cpu_cores, int *ncores)
{
  FILE *nvidia_info, *pci_bus_info;
  size_t nbytes = 255;
  char *my_line;
  char nvidia_info_path[255], pci_bus_info_path[255];
  char bus_info[255];

  // the nvidia driver populates this path for each gpu
  sprintf(nvidia_info_path, "/proc/driver/nvidia/gpus/%d/information", my_gpu);
  nvidia_info = fopen(nvidia_info_path, "r");
  if (nvidia_info == NULL) {
    return -1;
  }

  my_line = (char *)safe_malloc(nbytes + 1);
  pci_bus_info_path[0] = '\0';

  while (!feof(nvidia_info)) {
    if (-1 == getline(&my_line, &nbytes, nvidia_info)) {
      break;
    } else {
      // the first 7 chars of the Bus Location lead to the corresponding
      // path under /sys/class/pci_bus/; the cpulistaffinity file there
      // lists the cores on that bus
      if (1 == sscanf(my_line, "Bus Location: %s", bus_info)) {
	sprintf(pci_bus_info_path, "/sys/class/pci_bus/%.7s/cpulistaffinity",
		bus_info);
      }
    }
  }
  fclose(nvidia_info); // done with the driver info file

  if (pci_bus_info_path[0] == '\0') { // no "Bus Location" line was found
    host_free(my_line);
    return -1;
  }

  // open the cpulistaffinity file on the pci_bus for "my_gpu"
  pci_bus_info = fopen(pci_bus_info_path, "r");
  if (pci_bus_info == NULL) {
    //printfQuda("Warning: opening file %s failed\n", pci_bus_info_path);
    host_free(my_line);
    return -1;
  }

  while (!feof(pci_bus_info)) {
    if (-1 == getline(&my_line, &nbytes, pci_bus_info)) {
      break;
    } else {
      int rc = process_core_string_list(my_line, cpu_cores, ncores);
      if (rc < 0) {
	warningQuda("Failed to process the line \"%s\"", my_line);
	host_free(my_line);
	fclose(pci_bus_info); // was incorrectly closing nvidia_info here
	return -1;
      }
    }
  }
  fclose(pci_bus_info);

  host_free(my_line);
  return 0;
}
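
A hedged usage sketch (the caller, buffer size, and scheduler binding are assumptions, not from the source): the core list filled in by getNumaAffinity() is typically handed to sched_setaffinity() to pin the process near its GPU.

#define _GNU_SOURCE
#include <sched.h>

int cores[128], ncores = 0;
if (getNumaAffinity(gpu_id, cores, &ncores) == 0) {
  cpu_set_t set;
  CPU_ZERO(&set);
  for (int c = 0; c < ncores; c++) CPU_SET(cores[c], &set);
  sched_setaffinity(0, sizeof(set), &set); /* pid 0 = calling process */
}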
Example #3
  void cpuColorSpinorField::freeGhostBuffer(void)
  {
    if(!initGhostFaceBuffer) return;

    for(int i=0;i < 4; i++){
      host_free(fwdGhostFaceBuffer[i]); fwdGhostFaceBuffer[i] = NULL;
      host_free(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL;
      host_free(fwdGhostFaceSendBuffer[i]); fwdGhostFaceSendBuffer[i] = NULL;
      host_free(backGhostFaceSendBuffer[i]);  backGhostFaceSendBuffer[i] = NULL;
    } 
    initGhostFaceBuffer = 0;
  }
Example #4
  void cpuColorSpinorField::destroy() {
  
    if (init) {
      if (fieldOrder == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) 
	for (int i=0; i<x[nDim-1]; i++) host_free(((void**)v)[i]);
      host_free(v);
      init = false;
    }

    if (siteSubset == QUDA_FULL_SITE_SUBSET) {
      if (even) delete even;
      if (odd) delete odd;
    }

  }
Example #5
static void loadParityHw(ParityHw ret, void *hw, QudaPrecision cpu_prec)
{
    if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
        errorQuda("CUDA double precision requires CPU double precision");
    }

    if (ret.precision != QUDA_HALF_PRECISION) {

        void *packedHw1 = pinned_malloc(ret.bytes);

        if (ret.precision == QUDA_DOUBLE_PRECISION) {
            packParityHw((double2*)packedHw1, (double*)hw, ret.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                packParityHw((float2*)packedHw1, (double*)hw, ret.volume);
            } else {
                packParityHw((float2*)packedHw1, (float*)hw, ret.volume);
            }
        }
        cudaMemcpy(ret.data, packedHw1, ret.bytes, cudaMemcpyHostToDevice);
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(ret.X, QUDA_SINGLE_PRECISION);
          loadParityHw(tmp, hw, cpu_prec, dirac_order);
          copyCuda(ret, tmp);
          freeParityHw(tmp);
        */
    }
}
Example #6
static void retrieveParityHw(void *res, ParityHw hw, QudaPrecision cpu_prec)
{
    if (hw.precision != QUDA_HALF_PRECISION) {

        void *packedHw1 = pinned_malloc(hw.bytes);
        cudaMemcpy(packedHw1, hw.data, hw.bytes, cudaMemcpyDeviceToHost);

        if (hw.precision == QUDA_DOUBLE_PRECISION) {
            unpackParityHw((double*)res, (double2*)packedHw1, hw.volume);
        } else {
            if (cpu_prec == QUDA_DOUBLE_PRECISION) {
                unpackParityHw((double*)res, (float2*)packedHw1, hw.volume);
            } else {
                unpackParityHw((float*)res, (float2*)packedHw1, hw.volume);
            }
        }
        host_free(packedHw1);

    } else {
        //half precision
        /*
          ParityHw tmp = allocateParityHw(hw.X, QUDA_SINGLE_PRECISION);
          copyCuda(tmp, hw);
          retrieveParityHw(res, tmp, cpu_prec, dirac_order);
          freeParityHw(tmp);
        */
    }
}
Example #7
  void cpuColorSpinorField::destroy() {
  
    if (precision == QUDA_DOUBLE_PRECISION) {
      delete order_double;
    } else if (precision == QUDA_SINGLE_PRECISION) {
      delete order_single;
    } else {
      errorQuda("Precision %d not supported", precision);
    }
  
    if (init) {
      if (fieldOrder == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) 
	for (int i=0; i<x[nDim-1]; i++) host_free(((void**)v)[i]);
      host_free(v);
      init = false;
    }

  }
Example #8
void 
comm_init(void)
{
  int i;
  
  static int firsttime=1;
  if (!firsttime) return;
  firsttime = 0;

  gethostname(hostname, 128);
  hostname[127] = '\0';

  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int gpus_per_node = getGpuCount();  

  comm_partition();

  back_nbr = (rank - 1 + size) % size;
  fwd_nbr = (rank + 1) % size;
  num_nodes = size / gpus_per_node;
  if (num_nodes == 0) {
    num_nodes = 1;
  }

  //determine which gpu this MPI process is going to use
  char* hostname_recv_buf = (char*)safe_malloc(128*size);
  
  int rc = MPI_Allgather(hostname, 128, MPI_CHAR, hostname_recv_buf, 128, MPI_CHAR, MPI_COMM_WORLD);
  if (rc != MPI_SUCCESS){
    printf("ERROR: MPI_Allgather failed for hostname\n");
    comm_exit(1);
  }

  which_gpu = 0;
  for (i = 0; i < size; i++) {
    if (i == rank) {
      break;
    }
    if (strncmp(hostname, hostname_recv_buf + 128*i, 128) == 0) {
      which_gpu++;
    }
  }
  
  if (which_gpu >= gpus_per_node){
    printf("ERROR: invalid gpu(%d) to use in rank=%d mpi process\n", which_gpu, rank);
    comm_exit(1);
  }
  
  srand(rank*999);
  
  host_free(hostname_recv_buf);
  return;
}
Example #9
  cpuGaugeField::~cpuGaugeField()
  {
    if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
      if (order == QUDA_QDP_GAUGE_ORDER) {
	for (int d=0; d<nDim; d++) {
	  if (gauge[d]) host_free(gauge[d]);
	}
	if (gauge) host_free(gauge);
      } else {
	if (gauge) host_free(gauge);
      }
    } else { // QUDA_REFERENCE_FIELD_CREATE 
      if (order == QUDA_QDP_GAUGE_ORDER){
	if (gauge) host_free(gauge);
      }
    }
  
    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
      for (int i=0; i<nDim; i++) {
	if (ghost[i]) host_free(ghost[i]);
      }
    }
  }
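
The destructor's branches mirror the allocation layout: in QDP order the field owns one buffer per dimension plus the pointer table (and the pointer table alone for reference fields), while other orders use a single contiguous block. A schematic sketch of that layout (names and sizes are illustrative, not from the source):

  void **gauge = (void **)safe_malloc(nDim * sizeof(void *)); // pointer table, owned even by reference fields
  if (create != QUDA_REFERENCE_FIELD_CREATE) {
    for (int d = 0; d < nDim; d++)
      gauge[d] = safe_malloc(volume * reconstruct * precision); // owned per-dimension links
  } else {
    for (int d = 0; d < nDim; d++)
      gauge[d] = ((void **)param.gauge)[d]; // user-owned links, never freed here
  }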
Example #10
/* Deallocate all data and structures for a given data instance. */
LOCAL void delete_NIDDB IFN1
   (
   int, record_id	/* (I ) Record ID (ie virtualising byte value) */
   )
   {
   int i;
   IHP *instance_ptr;

   /* Ensure instance is active (for terminate callback). */
   swap_NIDDB(record_id);

   instance_ptr = vrecs[record_id].vr_pinst_tbl;

   /* Deallocate all data areas in instance_ptr table. */
   for (i = 0; i < MAX_INSTANCES; i++, instance_ptr++)
      {
      if ( *instance_ptr != (IHP)0 )
	 {
	 /* Action any terminate callback */
	 if ( terminate_callback[i] != (NIDDB_TM_CALLBACK)0 )
	    {
	    (terminate_callback[i])();
	    }

	 /* free up memory */
	 host_free(*instance_ptr);
	 }
      }

   /* Deallocate the instance_ptr table itself */
   host_free((IHP)vrecs[record_id].vr_pinst_tbl);

   /* Initialise virtual record entry */
   vrecs[record_id].vr_pinst_tbl = (IHP *)0;
   vrecs[record_id].vr_inst_handle = (IU32)0;
   }
Example #11
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data)
{
  int initialized;
  MPI_CHECK( MPI_Initialized(&initialized) );

  if (!initialized) {
    errorQuda("MPI has not been initialized");
  }

  MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &rank) );
  MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &size) );

  int grid_size = 1;
  for (int i = 0; i < ndim; i++) {
    grid_size *= dims[i];
  }
  if (grid_size != size) {
    errorQuda("Communication grid size declared via initCommsGridQuda() does not match"
              " total number of MPI ranks (%d != %d)", grid_size, size);
  }

  Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data);
  comm_set_default_topology(topo);

  // determine which GPU this MPI rank will use
  char *hostname = comm_hostname();
  char *hostname_recv_buf = (char *)safe_malloc(128*size);
  
  MPI_CHECK( MPI_Allgather(hostname, 128, MPI_CHAR, hostname_recv_buf, 128, MPI_CHAR, MPI_COMM_WORLD) );

  gpuid = 0;
  for (int i = 0; i < rank; i++) {
    if (!strncmp(hostname, &hostname_recv_buf[128*i], 128)) {
      gpuid++;
    }
  }
  host_free(hostname_recv_buf);

  int device_count;
  cudaGetDeviceCount(&device_count);
  if (device_count == 0) {
    errorQuda("No CUDA devices found");
  }
  if (gpuid >= device_count) {
    errorQuda("Too few GPUs available on %s", hostname);
  }
}
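
A usage sketch (the call sequence and grid values are illustrative, not from the source): once comm_init() has counted the earlier ranks that share this hostname, the resulting gpuid is what gets bound to the rank.

  int dims[4] = {1, 1, 2, 2};                 // 4 ranks in a 1x1x2x2 grid
  comm_init(4, dims, rank_from_coords, NULL); // sets rank, size, and gpuid
  cudaSetDevice(gpuid);                       // bind this rank to its GPU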
Example #12
  // This does the exchange of the gauge field ghost zone and places it
  // into the ghost array.
  void cpuGaugeField::exchangeGhost() {
    if (ghostExchange) return;

    void *send[QUDA_MAX_DIM];
    for (int d=0; d<nDim; d++) send[d] = safe_malloc(nFace*surface[d]*reconstruct*precision);

    // get the links into contiguous buffers
    extractGaugeGhost(*this, send);

    // communicate between nodes
    FaceBuffer faceBuf(x, nDim, reconstruct, nFace, precision);
    faceBuf.exchangeLink(ghost, send, QUDA_CPU_FIELD_LOCATION);

    for (int d=0; d<nDim; d++) host_free(send[d]);

    ghostExchange = true;
  }
Example #13
void
host_load(void)
{
    FILE *f;
    int ip;
    char name[80];
    char *p = name;

    if (!(f = fopen("nethost.cache", "r")))
        return;

    if (hostcache_list)
        host_free();

    while (fscanf(f, "%x %s\n", &ip, p) == 2)
        host_add(ip, name);

    fclose(f);
}
Example #14
status_t CameraAcc::acc_read_fw(const char* filename, fw_info &fw)
{
    LOG1("@%s", __FUNCTION__);

    FILE* file;
    if (!filename)
        return UNKNOWN_ERROR;

    LOG1("filename=%s", filename);
    fw.size = 0;
    fw.data = NULL;

    file = fopen(filename, "rb");
    if (!file)
        return UNKNOWN_ERROR;

    fseek(file, 0, SEEK_END);
    fw.size = ftell(file);
    fseek(file, 0, SEEK_SET);

    if (fw.size == 0) {
        fclose(file);
        return UNKNOWN_ERROR;
    }

    fw.data = host_alloc(fw.size);
    if (fw.data == NULL) {
        fclose(file);
        return UNKNOWN_ERROR;
    }

    if (fread(fw.data, 1, fw.size, file) != fw.size) {
        fclose(file);
        host_free(fw.data);
        return UNKNOWN_ERROR;
    }

    fclose(file);

    return NO_ERROR;
}
Example #15
  void cpuColorSpinorField::exchangeGhost(QudaParity parity, int nFace, int dagger, const MemoryLocation *dummy1,
					  const MemoryLocation *dummy2, bool dummy3, bool dummy4) const
  {
    // allocate ghost buffer if not yet allocated
    allocateGhostBuffer(nFace);

    void **sendbuf = static_cast<void**>(safe_malloc(nDimComms * 2 * sizeof(void*)));

    for (int i=0; i<nDimComms; i++) {
      sendbuf[2*i + 0] = backGhostFaceSendBuffer[i];
      sendbuf[2*i + 1] = fwdGhostFaceSendBuffer[i];
      ghost_buf[2*i + 0] = backGhostFaceBuffer[i];
      ghost_buf[2*i + 1] = fwdGhostFaceBuffer[i];
    }

    packGhost(sendbuf, parity, nFace, dagger);

    exchange(ghost_buf, sendbuf, nFace);

    host_free(sendbuf);
  }
Example #16
/* Deallocate per Virtual Machine data area for Device Driver. */
GLOBAL void NIDDB_Deallocate_Instance_Data IFN1
   (
   IHP *, handle	/* Handle to data area */
   )
   {
   int i;

   if ( !allocation_allowed )
      {
      /*
	 We are still managing instances for Windows, or at least we think we 
	 are. Has the user escaped from Windows without our VxD being informed,
	 or is the Insignia Device Driver giving us a bum steer?
       */
      always_trace0("NIDDB: Unexpected call to NIDDB_Deallocate_Instance_Data.");

      /* We might give the Insignia Device Driver the benefit of the doubt and
	 act like a System_Exit message. - Then again we might just ignore em. */
      return;
      }

   /* Find index to master_ptrs, etc. */
   i = handle - &master_ptrs[0];
   if ( i < 0 || i >= MAX_INSTANCES )
      {
      always_trace0("NIDDB: Bad handle passed to NIDDB_Deallocate_Instance_Data.");
      return;
      }

   /* Free data area */
   host_free(master_ptrs[i]);

   /* Initialise entry */
   master_ptrs[i] = snapshot_ptrs[i] = (IHP)0;
   instance_size[i] = (IU32)0;
   create_callback[i] = (NIDDB_CR_CALLBACK)0;
   terminate_callback[i] = (NIDDB_TM_CALLBACK)0;

   return;
   }
Example #17
/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{
	void *p;	/* Returned page */
	struct vm_page_list *page_list;
	vm_page_t pages;
	unsigned long size, page_num, pfn;
	unsigned long i;

	size = round_page(bytes);
	page_num = size >> PAGE_SHIFT;

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_FREEBSD_KERNEL;
	p = (void *)host_malloc(size, PAGE_SIZE); /* in place of kmem_malloc(kmem_map, bytes, wait) */
	pfn = ((unsigned long)p) >> PAGE_SHIFT;

	if (p != NULL) {
		pages = (vm_page_t)host_malloc(sizeof(struct vm_page) * page_num, -1);
		if (pages != NULL) {
			/* register each backing page in the page/slab hash */
			for (i = 0; i < page_num; i++, pfn++) {
				page_list = &page_slab_hash[pfn % MAX_UPTCP_PAGENUM];
				pages[i].page_addr = (uint8_t *)(pfn << PAGE_SHIFT);
				pages[i].flags = 0;
				pages[i].object = NULL;
				SLIST_INSERT_HEAD(page_list, &pages[i], page_link);
			}
		} else {
			/* page table allocation failed: release the backing memory */
			host_free(p);
			return NULL;
		}
	}

	return (p);
}
Example #18
static void
gauge_force_test(void) 
{
  int max_length = 6;    
  
  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;
  
  setDims(qudaGaugeParam.X);
  
  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;  
  qudaGaugeParam.reconstruct_sloppy = link_recon;  
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links   
  
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec;
    
  void* sitelink;  
  void* sitelink_1d;
  
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif
  
  // this is a hack to have site link generated in 2d 
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif
  
  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
  
  //copy the 2d sitelink to 1d milc format 
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
    sitelink =  sitelink_1d;    
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }
  
#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }
    
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
      continue;
    }
    
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;
    
    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i
  
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);      
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);
  
  
  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }
    
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
  
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));    
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }
  
  struct timeval t0, t1;
  double timeinfo[3];
  /* Multiple executions, so the warmup cost of the first run can be excluded */
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
			  loop_coeff_d, num_paths, max_length, eb3,
			  &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }
 
  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;
    
  if (verify_results){	
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      //last arg=0 means no optimization for communication, i.e. exchange data in all directions
      //even if they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
			       QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);    
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#endif
    }
  
    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    
    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }  

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
	 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }
  
  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);
  
#ifdef MULTI_GPU  
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif


  host_free(mom);
  host_free(refmom);
  endQuda();
}            
Example #19
void comm_free(void *handle) {
  host_free((MPI_Request*)handle);
}
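
For context, a hedged sketch of the allocation this pairs with (the wrapper below is hypothetical): the handle is nothing more than a heap-allocated MPI_Request, so releasing it is a plain host_free().

  void *comm_declare_send(void *buf, size_t nbytes, int dest, int tag)
  {
    MPI_Request *request = (MPI_Request *)safe_malloc(sizeof(MPI_Request));
    MPI_Isend(buf, (int)nbytes, MPI_BYTE, dest, tag, MPI_COMM_WORLD, request);
    return (void *)request; /* later released via comm_free() */
  }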
Example #20
/* Allocate data structures required for new data instance. */
LOCAL IBOOL allocate_NIDDB IFN2
   (
   IU32,  inst_handle,	/* (I ) Windows handle for Virtual Machine */
   int *, record_id	/* ( 0) Record ID (ie virtualising byte value) */
   )
   {
   int v;
   int i;
   IHP *p;
   IHP *instance_ptr;

   /* Search for empty virtual record */
   for (v = 0; v < MAX_VMS; v++)
      {
      if ( vrecs[v].vr_pinst_tbl == (IHP *)0 )
	 break;   /* found empty slot */
      }

   /* Ensure we found empty slot */
   if ( v == MAX_VMS )
      {
      /* No free slot! */
      always_trace0("NIDDB: Too many Virtual Machines being requested.");
      return FALSE;
      }

   /* Allocate new instance table - ensure it is zero */
   if ( (instance_ptr = (IHP *)host_calloc(1, sizeof(master_ptrs))) == (IHP *)0 )
      {
      /* No room at the inn */
      return FALSE;
      }

   /* Allocate new data areas */
   for (i = 0, p = instance_ptr; i < MAX_INSTANCES; i++, p++)
      {
      /* Use master pointer as the 'creation template' */
      if ( master_ptrs[i] != (IHP)0 )
	 {
	 if ( (*p = (IHP)host_malloc(instance_size[i])) == (IHP)0 )
	    {
	    /* No room at the inn */

	    /* Clean up any blocks which may have been allocated */
	    for (i = 0, p = instance_ptr; i < MAX_INSTANCES; i++, p++)
	       {
	       if ( *p != (IHP)0 )
		  host_free(*p);
	       }

	    /* Deallocate the instance table itself */
	    host_free((IHP)instance_ptr);

	    return FALSE;
	    }
	 }
      }

   /* Finally fill in virtual record */
   vrecs[v].vr_inst_handle = inst_handle;
   vrecs[v].vr_pinst_tbl = instance_ptr;
   *record_id = v;
   return TRUE;
   }
Example #21
  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
  {
    if (cudaGauge->Precision() != cpuGauge->Precision()){
      errorQuda("Mismatch between CPU precision and CUDA precision");
    }
    QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
    const int* Z = cudaGauge->X();
#endif
    int pad = cudaGauge->Pad();
    int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
    int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
    int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
    int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;

    static void* ghost_cpuGauge[4];
    static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
    static int allocated = 0;
    int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
  
    if (!allocated) {

      for(int i=0;i < 4; i++) {
	size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;      
#ifdef GPU_DIRECT 
	ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
#else
	ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
#endif
      }

      /*
       *  nu |     |
       *     |_____|
       *       mu     
       */
    
      for(int nu=0;nu < 4;nu++){
	for(int mu=0; mu < 4;mu++){
	  if(nu == mu){
	    ghost_cpuGauge_diag[nu*4+mu] = NULL;
	  }else{
	    //the other directions
	    int dir1, dir2;
	    for(dir1= 0; dir1 < 4; dir1++){
	      if(dir1 !=nu && dir1 != mu){
		break;
	      }
	    }
	    for(dir2=0; dir2 < 4; dir2++){
	      if(dir2 != nu && dir2 != mu && dir2 != dir1){
		break;
	      }
	    }
	    //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);

	    size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT 
	    ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
#else
	    ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
#endif
	    memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
	  }	
	}
      }
      allocated = 1;
    }

    int optflag = 1;
    // driver for packing all the links
    exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
  
#endif
  
    do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(), 
		     ghost_cpuGauge, ghost_cpuGauge_diag, 
		     cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad, 
		     Vsh_x, Vsh_y, Vsh_z, Vsh_t, 
		     prec, cpuGauge->Order());
  
#ifdef MULTI_GPU
    if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    
      for(int i=0;i < 4;i++){
	host_free(ghost_cpuGauge[i]);
      }
      for(int i=0;i <4; i++){ 
	for(int j=0;j <4; j++){
	  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
	}
      }    
      allocated = 0;
    }
#endif
  
  }
Example #22
 CloverField::~CloverField() {
   host_free(trlog);
 }
Example #23
void
host_shutdown(void)
{
    host_save();
    host_free();
}
Example #24
static int mb_config_add_host (oconfig_item_t *ci) /* {{{ */
{
  mb_host_t *host;
  int status;
  int i;

  host = malloc (sizeof (*host));
  if (host == NULL)
    return (ENOMEM);
  memset (host, 0, sizeof (*host));
  host->slaves = NULL;

  status = cf_util_get_string_buffer (ci, host->host, sizeof (host->host));
  if (status != 0)
  {
    host_free (host); /* avoid leaking the partially initialized host */
    return (status);
  }
  if (host->host[0] == 0)
  {
    host_free (host);
    return (EINVAL);
  }

  for (i = 0; i < ci->children_num; i++)
  {
    oconfig_item_t *child = ci->children + i;
    status = 0;

    if (strcasecmp ("Address", child->key) == 0)
    {
      char buffer[NI_MAXHOST];
      status = cf_util_get_string_buffer (child, buffer, sizeof (buffer));
      if (status == 0)
        status = mb_config_set_host_address (host, buffer);
    }
    else if (strcasecmp ("Port", child->key) == 0)
    {
      host->port = cf_util_get_port_number (child);
      if (host->port <= 0)
        status = -1;
    }
    else if (strcasecmp ("Interval", child->key) == 0)
      status = cf_util_get_cdtime (child, &host->interval);
    else if (strcasecmp ("Slave", child->key) == 0)
      /* Don't set status: Gracefully continue if a slave fails. */
      mb_config_add_slave (host, child);
    else
    {
      ERROR ("Modbus plugin: Unknown configuration option: %s", child->key);
      status = -1;
    }

    if (status != 0)
      break;
  } /* for (i = 0; i < ci->children_num; i++) */

  if (host->host[0] == 0)
  {
    ERROR ("Modbus plugin: Host block: No host name has been specified.");
    status = -1;
  }

  if (status == 0)
  {
    user_data_t ud;
    char name[1024];
    struct timespec interval = { 0, 0 };

    ud.data = host;
    ud.free_func = host_free;

    ssnprintf (name, sizeof (name), "modbus-%s", host->host);

    CDTIME_T_TO_TIMESPEC (host->interval, &interval);

    plugin_register_complex_read (/* group = */ NULL, name,
        /* callback = */ mb_read,
        /* interval = */ (host->interval > 0) ? &interval : NULL,
        &ud);
  }
  else
  {
    host_free (host);
  }

  return (status);
} /* }}} int mb_config_add_host */
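
For reference, a sketch of the configuration block this function parses; the option keys come from the code above, while the values are illustrative:

  <Host "plant-floor">
    Address  "192.168.0.42"
    Port     "502"
    Interval 10
    <Slave 1>
      # slave options handled by mb_config_add_slave()
    </Slave>
  </Host>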
Example #25
void comm_free(MsgHandle *mh)
{
  host_free(mh);
}
Example #26
void comm_free(MsgHandle *mh)
{
  QMP_free_msghandle(mh->handle);
  QMP_free_msgmem(mh->mem);
  host_free(mh);
}
Example #27
void comm_destroy_topology(Topology *topo)
{
  host_free(topo->ranks);
  host_free(topo->coords);
  host_free(topo);
}
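
A hedged sketch of the matching creation path (the member types and fill-in code are assumptions; only the member names come from the code above): comm_destroy_topology() expects the two lookup tables and the struct itself to have come from safe_malloc.

  Topology *topo = (Topology *)safe_malloc(sizeof(Topology));
  topo->ranks = (int *)safe_malloc(nranks * sizeof(int));         // coords -> rank table
  topo->coords = (int *)safe_malloc(nranks * ndim * sizeof(int)); // rank -> coords table
  /* ... populate both tables, then comm_set_default_topology(topo) ... */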