Ejemplo n.º 1
0
/*
 * Handle a CUDAMALLOC request that arrived on socket descriptor `sd`.
 *
 * Copies *data into the record returned by get_proc(sd), asks NVML for the
 * memory state of device data->pos and, when the free memory exceeds the
 * request plus M64 plus the amount already reserved on that device, replies
 * GOAHEAD and adds the request to the device's reserved counter.
 * The else branch continues beyond this excerpt.
 */
void _CUDAMALLOC(int sd,proc_data* data){

  printf("CUDAMALLOC(%d)\n",sd);

  proc* p;
  nvmlReturn_t res;
  nvmlMemory_t mem;
  int devp;

  /* Refresh the stored per-process data with the incoming request. */
  p = get_proc(sd);
  memcpy(p->data,data,sizeof(proc_data));

  devp = data->pos;

  printf("DEVPOS : %d\n",data->pos);
  printf("\tPID : %d\n",p->data->pid);

  res = nvmlDeviceGetMemoryInfo(dem.devs[devp],&mem);

  /* A failed NVML query terminates the whole program. */
  if(res != NVML_SUCCESS){
    printf("Failed to get Memory Information\n");
    exit(-1);
  }

  /* Grant only when the request still fits next to what is already reserved. */
  if(mem.free > p->data->req + M64 + dem.flags[devp].reserved){

    printf("\tGOAHEAD (REQ : %lu[MB])\n",p->data->req >> 20);

    MSEND(sd,GOAHEAD,0,0,0,0,0);

    /* Account for the granted request so later checks see it as taken. */
    dem.flags[devp].reserved += p->data->req;
    
  }else{
Ejemplo n.º 2
0
// Build the set of device features
static void get_device_features(struct device* dev)
{
  if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                              &dev->temperature) == NVML_SUCCESS) {
    dev->feature_support |= TEMPERATURE;
  }

  if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) {
    dev->feature_support |= MEMORY_INFO;
  }

  if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) {
    dev->feature_support |= POWER_USAGE;
  }

  if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS,
                            &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM,
                            &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT,
                            &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) {

    dev->feature_support |= CLOCK_INFO;
  }

  if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) {
    dev->feature_support |= FAN_INFO;
  }

  if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) {
    dev->feature_support |= UTILIZATION_INFO;
  }
}
Ejemplo n.º 3
0
/*
 * Try to start one specific queued process on the first device that has
 * room for it.
 *
 * Does nothing unless `p` is non-NULL and in the QUEUED state. Devices whose
 * flag is already set are skipped. For each remaining device the flag is
 * held while NVML is asked for the free memory; a failed query terminates
 * the program. On the first device that fits, the process becomes ACTIVE
 * and is sent MIGRATE (if it already created a context; the flag stays set)
 * or CONNECT (otherwise; the flag and sd are released again).
 */
void dequeueSpecifyProc(proc* p){

  nvmlMemory_t meminfo;
  nvmlReturn_t rc;
  int d;

  if(p == NULL)return;
  if(p->queued != QUEUED)return;

  for(d = 0 ; d < dem.ndev ; d ++){

    /* Device is claimed by someone else right now. */
    if(dem.flags[d].flag){
      printf("\tdevice[%d] cannot be selected...\n",d);
      continue;
    }

    /* Claim the device while it is being examined. */
    dem.flags[d].flag = 1;

    rc = nvmlDeviceGetMemoryInfo(dem.devs[d],&meminfo);

    if(rc != NVML_SUCCESS){
      printf("Failed to get Memory Info in dequeue\n");
      exit(-1);
    }

    /* Not enough room here: release the claim and try the next device. */
    if(!(meminfo.free > p->data->mem + p->data->req + M64 + dem.flags[d].reserved)){
      dem.flags[d].flag = 0;
      continue;
    }

    dem.flags[d].sd = p->sd;
    dem.flags[d].flag = 1;

    p->queued = ACTIVE;

    if(p->created_context){

      /* Existing context: reserve its whole footprint and ask it to move. */
      dem.flags[d].reserved += p->data->mem + p->data->req;

      MSEND(p->sd,MIGRATE,0,0,d,0,0);

    }else{

      /* No context yet: reserve the symbol area and let it connect. */
      dem.flags[d].reserved += p->data->sym;

      MSEND(p->sd,CONNECT,0,0,d,0,0);

      dem.flags[d].sd = -1;
      dem.flags[d].flag = 0;

    }

    printf("MIGRATE(%d) to %d\n",p->sd,d);

    return;
  }
}
Ejemplo n.º 4
0
/*
 * Take a one-shot snapshot of every NVML device.
 *
 * Initialises NVML, allocates a zeroed array with one devstat per device,
 * stores it in *stats (the caller owns it and must free() it), fills in the
 * handle, memory info, utilisation rates and encoder/decoder utilisation of
 * each device, and shuts NVML down again.
 *
 * Per-device queries are best effort: a query that fails leaves the
 * corresponding zero-initialised fields untouched. A device whose handle
 * cannot be obtained is skipped entirely.
 *
 * Returns the number of devices, or -1 if the array could not be allocated
 * (in which case *stats is NULL). Failures of nvmlInit, nvmlDeviceGetCount
 * and nvmlShutdown are handled by CHK_NVML.
 */
int probe_gpustats(devstat**stats)
{

    unsigned int n_dev;
    unsigned int i;
    unsigned int sampp;
    nvmlReturn_t nvret;
    devstat*pstats;


    nvret=nvmlInit();
    CHK_NVML(nvret,"Init NVML");


    nvret=nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret,"getCount");


    *stats=(devstat*)calloc(n_dev,sizeof(devstat));
    pstats=*stats;

    /* calloc(0, ...) may legitimately return NULL, so only treat NULL as an
     * error when there is something to store. */
    if(pstats==NULL && n_dev>0){
        nvmlShutdown();
        return -1;
    }


    for(i=0;i<n_dev;i++){

        /* Without a handle every further query on this device would fail. */
        if(nvmlDeviceGetHandleByIndex(i,&pstats[i].handler)!=NVML_SUCCESS)
            continue;

        nvmlDeviceGetMemoryInfo(pstats[i].handler,&pstats[i].meminfo);
        nvmlDeviceGetUtilizationRates(pstats[i].handler,&pstats[i].utils);

        /* sampp receives the sampling period, which is not used here. */
        nvmlDeviceGetEncoderUtilization(pstats[i].handler,&pstats[i].encutil,&sampp);
        nvmlDeviceGetDecoderUtilization(pstats[i].handler,&pstats[i].decutil,&sampp);
    }
#if 0
    int maxfreeind=0;
    int maxfree=0;
    for(i=0;i<n_dev;i++){

        print_devstats(&pstats[i]);

        int free=pstats[i].meminfo.free; 
//        fprintf(stderr,"<%d\n",free);
        if(free>maxfree){
            maxfree=free;
            maxfreeind=i;
        }

    }
#endif
    nvret=nvmlShutdown();
    CHK_NVML(nvret,"Shutdown NVML");


    return n_dev;
}
Ejemplo n.º 5
0
	// Take one measurement sample from the NVML device and store it in the
	// MS_MEASUREMENT_GPU structure that pMsMeasurement points to.
	//
	// The power usage is read on every call. When TVariant is VARIANT_FULL,
	// the memory usage, performance state, temperature and SM/memory clocks
	// are additionally read, but only on calls where the running call counter
	// is a multiple of TSkipMs. Any failing NVML query is logged and
	// terminates the process. rThreadNum is only used in log messages.
	// The function continues beyond this excerpt.
	void CMeasureNVML<TSkipMs, TVariant>::measure(void *pMsMeasurement, int32_t& rThreadNum) {
		nvmlReturn_t result;
		MS_MEASUREMENT_GPU *pMsMeasurementGpu = (MS_MEASUREMENT_GPU *) pMsMeasurement;
		
		result = nvmlDeviceGetPowerUsage(mDevice, &(pMsMeasurementGpu->nvml_power_cur));
		if (NVML_SUCCESS != result) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no power usage reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		
		if(TVariant == VARIANT_FULL) {
			nvmlMemory_t memory;
			// The counter is post-incremented on every call; the slower
			// queries below run only when its previous value is divisible
			// by TSkipMs.
			if(!(mMeasureCounter++ % TSkipMs)) {
				result = nvmlDeviceGetMemoryInfo(mDevice, &memory);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				// Values are shifted right by 10 bits (divided by 1024) and
				// narrowed to 32 bits before being stored.
				pMsMeasurementGpu->nvml_memory_free_cur = (uint32_t)(memory.free >> 10);
				pMsMeasurementGpu->nvml_memory_used_cur = (uint32_t)(memory.used >> 10);
				
				result = nvmlDeviceGetPerformanceState(mDevice, (nvmlPstates_t*)&(pMsMeasurementGpu->internal.nvml_power_state));
				if (NVML_SUCCESS != result) {
					// NOTE(review): unlike the other error paths in this function,
					// this one logs without mrLog.lock()/unlock() - confirm
					// whether that is intentional.
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					exit(EXIT_FAILURE);
				}
			
				nvmlTemperatureSensors_t sensorType = NVML_TEMPERATURE_GPU;
			
				result = nvmlDeviceGetTemperature(mDevice, sensorType, &(pMsMeasurementGpu->nvml_temperature_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read temperature. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			
				// Current SM clock.
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_SM, &(pMsMeasurementGpu->nvml_clock_sm_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			
				// Current memory clock.
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &(pMsMeasurementGpu->nvml_clock_mem_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			}
Ejemplo n.º 6
0
/*
 * Report the amount of used memory on every NVML device.
 *
 * ncores    - out: receives the device count reported by NVML.
 * usedarray - out: usedarray[i] receives the `used` value of device i,
 *             narrowed to unsigned int.
 *             NOTE(review): the array is presumed to hold at least NDEV
 *             entries, matching the bound the original handle array had -
 *             confirm against the callers.
 *
 * Returns 0 on success and -1 on any failure. NVML is initialised on entry
 * and, once initialisation succeeded, is always shut down again before
 * returning - including on error paths.
 */
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray)
{

    nvmlReturn_t ret;
    nvmlDevice_t dev;
    nvmlMemory_t meminfo;
    unsigned int c;
    unsigned int i;
    int rc=-1;

    ret=nvmlInit();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        goto out;
    }

    *ncores=c;

    /* More devices than NDEV would write past the per-device storage. */
    if(c>(unsigned int)NDEV) {
        fprintf(stderr,"ERROR:: Device count %u exceeds NDEV..\n",c);
        goto out;
    }

    for(i=0; i<c; i++) {

        ret=nvmlDeviceGetHandleByIndex(i,&dev);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            goto out;
        }

        ret=nvmlDeviceGetMemoryInfo(dev,&meminfo);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret));
            goto out;
        }
        usedarray[i]=meminfo.used;

    }

    rc=0;

out:
    /* Balance the successful nvmlInit() on every path. */
    ret=nvmlShutdown();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    return rc;

}
Ejemplo n.º 7
0
// Populate the static parts of the monitor: host name, driver and NVML
// versions, and one `struct device` per NVML device (handle, name, serial,
// UUID, PCI info, memory info, event set, maximum clocks and the feature
// bitmask computed by get_device_features()).
//
// NVML failures are routed through NVML_TRY. If the device array cannot be
// allocated, the monitor is left with zero devices so that later loops over
// mon->dev_count never touch the NULL array.
static void init_device_info(struct monitor* mon)
{
  // gethostname() is not required to NUL-terminate a truncated name, so
  // terminate explicitly at the end of the 64 bytes handed to it.
  gethostname(mon->hostname, 64);
  mon->hostname[64 - 1] = '\0';

  NVML_TRY(nvmlSystemGetDriverVersion(mon->driver_version,
                                      sizeof(mon->driver_version)));
  NVML_TRY(nvmlSystemGetNVMLVersion(mon->nvml_version,
                                    sizeof(mon->nvml_version)));

  NVML_TRY(nvmlDeviceGetCount(&mon->dev_count));

  mon->devices = calloc(mon->dev_count, sizeof(struct device));
  if(mon->devices == NULL) {
    mon->dev_count = 0;
  }

  for(unsigned i = 0; i < mon->dev_count; ++i) {
    struct device dev;
    memset(&dev, 0, sizeof(struct device));

    dev.index = i;

    NVML_TRY(nvmlDeviceGetHandleByIndex(i, &dev.handle));

    NVML_TRY(nvmlDeviceGetName(dev.handle, dev.name, sizeof(dev.name)));
    NVML_TRY(nvmlDeviceGetSerial(dev.handle, dev.serial, sizeof(dev.serial)));
    NVML_TRY(nvmlDeviceGetUUID(dev.handle, dev.uuid, sizeof(dev.uuid)));

    NVML_TRY(nvmlDeviceGetPciInfo(dev.handle, &dev.pci));
    NVML_TRY(nvmlDeviceGetMemoryInfo(dev.handle, &dev.memory));

    // Event set: only keep it when it was created AND the device reports
    // which events it supports. Otherwise release it again and record NULL
    // so that nobody waits on an unusable set.
    unsigned long long event_types;
    if(0 == NVML_TRY(nvmlEventSetCreate(&dev.event_set))) {
      if(0 == NVML_TRY(nvmlDeviceGetSupportedEventTypes(dev.handle, &event_types))) {
        NVML_TRY(nvmlDeviceRegisterEvents(dev.handle, event_types, dev.event_set));
      } else {
        NVML_TRY(nvmlEventSetFree(dev.event_set));
        dev.event_set = NULL;
      }
    } else {
      dev.event_set = NULL;
    }

    // Stop at the first clock type whose maximum cannot be read.
    for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT;
        ++type) {
      if(NVML_TRY(nvmlDeviceGetMaxClockInfo(dev.handle, type,
                                            &dev.max_clock[type])))
        break;
    }

    get_device_features(&dev);

    mon->devices[i] = dev;
  }

  mon->last_update = time(NULL);
}
Ejemplo n.º 8
0
	// Read the device's total memory once and store it, shifted right by
	// 10 bits and narrowed to 32 bits, in the MS_MEASUREMENT_GPU structure
	// behind pMsMeasurement. A failing NVML query is logged (rThreadNum is
	// used for the log message only) and terminates the process.
	void CMeasureNVML<TSkipMs, TVariant>::read_memory_total(void *pMsMeasurement, int32_t& rThreadNum) {
		nvmlMemory_t memInfo;
		const nvmlReturn_t status = nvmlDeviceGetMemoryInfo(mDevice, &memInfo);
		
		if (status != NVML_SUCCESS) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		
		MS_MEASUREMENT_GPU *const pGpu = static_cast<MS_MEASUREMENT_GPU *>(pMsMeasurement);
		pGpu->nvml_memory_total = static_cast<uint32_t>(memInfo.total >> 10);
	}
Ejemplo n.º 9
0
/*
 * Handle a CUDAMALLOC request that arrived on socket descriptor `sd`.
 *
 * Compares the growth of the process's memory region (data->mem minus the
 * previously recorded p->data->mem) with the free memory NVML reports for
 * device data->pos. If it fits, a reply with REQUEST = CONNECT is sent;
 * otherwise the reply carries REQUEST = SUSPEND, the process is marked as
 * queued and dequeueSpecifyProc() is asked to find it another device.
 * A failed NVML query terminates the program.
 *
 * The reply lives on the stack and is zero-initialised, so nothing leaks
 * and no indeterminate bytes are written to the socket.
 *
 * NOTE(review): p->data is pointed at the caller's `data` rather than
 * copied - confirm that `data` outlives the process record.
 */
void _CUDAMALLOC(int sd,proc_data* data){

  printf("CUDAMALLOC\n");

  proc_data reply = {0};
  proc* p;
  size_t prevRegion;
  size_t growth;
  nvmlReturn_t res;
  nvmlMemory_t mem;
  int devp;

  p = get_proc(sd);

  prevRegion = p->data->mem;

  p->data = data;

  devp = data->pos;

  res = nvmlDeviceGetMemoryInfo(dem.devs[devp],&mem);

  if(res != NVML_SUCCESS){
    printf("Failed to get Memory Information\n");
    exit(-1);
  }

  /* A shrinking region needs no additional memory; without this clamp the
   * unsigned subtraction would wrap to a huge value and always suspend. */
  growth = (data->mem > prevRegion) ? data->mem - prevRegion : 0;

  if(mem.free > growth){

    printf("\tGOAHEAD\n");

    reply.REQUEST = CONNECT;
    send(sd,&reply,sizeof(proc_data),0);

  }else{

    printf("\tOOPS\n");

    reply.REQUEST = SUSPEND;
    p->queued = 1;

    send(sd,&reply,sizeof(proc_data),0);

    dequeueSpecifyProc(p);

  }
}
Ejemplo n.º 10
0
static void update_device_info(struct monitor* mon)
{
  // TODO: NVML is thread safe, and the order we grab GPU information
  // here doesn't particularly matter, so might as well take advantage
  // of parallelism here.

  unsigned i;

  for(i = 0; i < mon->dev_count; ++i) {
    struct device* dev = &mon->devices[i];

    if(dev->feature_support & MEMORY_INFO) {
      NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory));
    }

    if(dev->feature_support & TEMPERATURE) {
      NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                                        &dev->temperature));
    }

    if(dev->feature_support & POWER_USAGE) {
      NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage));
    }

    if(dev->feature_support & CLOCK_INFO) {
      for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT;
          ++type) {
        NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type]));
      }
    }

    if(dev->feature_support & FAN_INFO) {
      NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan));
    }

    if(dev->event_set != NULL) {
      nvmlEventData_t data;

      NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1));

      // TODO: Do something with the returned information.
    }
  }

  mon->last_update = time(NULL);
}
Ejemplo n.º 11
0
		/*
		 * Return one field of the NVML memory information of `dev`.
		 *
		 * which_one selects the field: MEMINFO_TOTAL_MEMORY -> total,
		 * MEMINFO_UNALLOCED -> free, MEMINFO_ALLOCED -> used.
		 *
		 * Returns (unsigned long long)-1 when the NVML query fails or when
		 * which_one is not one of the selectors above.
		 */
		unsigned long long
getMemoryInfo( nvmlDevice_t dev, int which_one )
{
		nvmlMemory_t meminfo;
		nvmlReturn_t bad; 
		bad = nvmlDeviceGetMemoryInfo( dev, &meminfo );

		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
				/* meminfo was not filled in; reading it would yield
				 * indeterminate values, so report the error sentinel. */
				return (unsigned long long)-1;
		}

		switch (which_one) {
				case MEMINFO_TOTAL_MEMORY:
						return meminfo.total;
				case MEMINFO_UNALLOCED:
						return meminfo.free;
				case MEMINFO_ALLOCED:
						return meminfo.used;
				default:
						;
		}
		return (unsigned long long)-1;
}
Ejemplo n.º 12
0
/*
 * Handle the end of the connection on socket descriptor `sd`.
 *
 * If `sd` does not belong to a process (has_proc_sd() is false), the
 * matching console entry is removed and the function returns.
 *
 * Otherwise the process counter is decremented, every device flag entry
 * still bound to `sd` is released, and the device the process occupied
 * (p->data->pos) is handed on:
 *   - process without CANMIG: the device's `stayed` mark is cleared; if
 *     staying processes are waiting, one of them is sent CONNECT for this
 *     device, otherwise dequeueSpecifyDevNO() is called;
 *   - process with CANMIG on a `stayed` device: the staying process of that
 *     device, if any, is sent GOAHEAD when its request fits;
 *   - process with CANMIG on a device that is not `stayed`: a waiting
 *     staying process is connected, otherwise dequeueSpecifyDevNO() runs.
 * Finally the process is removed from the console and process lists.
 *
 * Inconsistent bookkeeping and NVML failures terminate the program.
 */
void _FIN(int sd){

  if(!has_proc_sd(sd)){

    cons* c;

    c = get_cons(sd);

    remove_cons(c);

    printf("CONS LEAVE\n");

    return;
  }

  dem.procCounter--;

  /* Both conversions need an argument: the descriptor and the counter. */
  printf("FIN(SD:%d) (NUM OF PROCS:%d)\n",sd,dem.procCounter);

  proc *p;
  int devNum,i;

  /* Release any device claim that is still bound to this descriptor. */
  for(i = 0 ; i < dem.ndev ; i ++){
    if(dem.flags[i].sd == sd){
      printf("\tFIND FAILED PROC(SD:%d)\n",sd);
      dem.flags[i].flag = 0;
      dem.flags[i].sd   = 0;
    }
  }

  p = get_proc(sd);

  devNum = p->data->pos;

  printf("\tPID : %d\n",p->data->pid);

  if(!(p->data->flag&CANMIG)){

    printf("Staying proc @ %d finished.\n",devNum);

    dem.flags[devNum].stayed = 0;

    if(dem.staying_procs > 0){

      proc* stayp;

      stayp = (proc*)staying_proc();

      /* The counter says one is waiting, so not finding it is fatal. */
      if(stayp == NULL){
        printf("oops ... failed to find staying proc...\n");
        exit(-1);
      }

      dem.staying_procs --;

      stayp->queued = ACTIVE;

      MSEND(stayp->sd,CONNECT,0,0,devNum,0,0);

      dem.flags[devNum].stayed = 1;

    }else{

      dequeueSpecifyDevNO(devNum);

    }

  }else{

    if(dem.flags[devNum].stayed){

      proc* ps;

      ps = (proc*)get_proc_staying_pos(devNum);

      if(ps == NULL){

        printf("PROC (CANNOT MIGRATE) IS ACTIVE NOW(%d)\n",devNum);

#if 0
        dequeueSpecifyDevNO(devNum);
#endif

      }else{

        nvmlReturn_t res;
        nvmlMemory_t mem;

        res = nvmlDeviceGetMemoryInfo(dem.devs[devNum],&mem);

        if(res != NVML_SUCCESS){
          printf("Failed to get Memory Information\n");
          exit(-1);
        }

        /* Both values are wider than unsigned int; print them as such. */
        printf("USED : %llu\n",(unsigned long long)ps->data->mem);
        printf("FREE : %llu\n",(unsigned long long)mem.free);

        if(mem.free > ps->data->req + M64 + dem.flags[devNum].reserved){

          MSEND(ps->sd,GOAHEAD,0,0,devNum,0,0);

          ps->queued = ACTIVE;

        }
#if 0
        else{

          dequeueSpecifyDevNO(devNum);

        }
#endif

      }

    }else{

      if(dem.staying_procs > 0){

        proc* ps;

        ps = staying_proc();

        if(ps == NULL){
          printf("What ...? 1\n");
          exit(-1);
        }

        MSEND(ps->sd,CONNECT,0,0,devNum,0,0);

        dem.staying_procs --;

        dem.flags[devNum].stayed = 1;

        ps->queued = ACTIVE;

      }else{

        dequeueSpecifyDevNO(devNum);

      }
    }
  }

  cons_remove(p);

  remove_proc(p);
}
Ejemplo n.º 13
0
/*
 * Handle a CONNECT request that arrived on socket descriptor `sd`.
 *
 * data->pos == -1 announces a new process: a record is created, added to
 * the process list and given its own copy of *data. If this pushes the
 * process count above MAXPROC, the process is only queued (STAYED_QUEUED
 * without CANMIG, QUEUED with it) and the function returns.
 * Any other data->pos refreshes the existing record of `sd`.
 *
 * The devices are then scanned in order. Processes without CANMIG skip
 * devices already marked `stayed`. On its current device a process needs
 * room for `req`, on any other device for `mem`, each on top of the
 * device's reserved amount and M64. The first device that fits is reserved
 * and answered with CONNECT. If none fits, the process is queued.
 *
 * Allocation failures and NVML failures terminate the program, like the
 * other NVML error paths in this file.
 */
void _CONNECT(int sd,proc_data* data){

  printf("CONNECT(%d) (QUEUESIZE   :%d)\n",sd,queue_size());
  printf("            (NUM OF PROCS:%d)\n",dem.procCounter);

  proc* p;
  int i;

  if(data->pos == -1){

    dem.procCounter++;

    p = create_proc(sd);

    add_proc(p);

    p->data = (proc_data*)malloc(sizeof(proc_data));
    if(p->data == NULL){
      printf("Failed to allocate proc_data\n");
      exit(-1);
    }
    memcpy(p->data,data,sizeof(proc_data));

    if(dem.procCounter > MAXPROC){

      if(!(p->data->flag&CANMIG)){

        dem.staying_procs ++;

        p->queued = STAYED_QUEUED;

      }else{

        p->queued = QUEUED;

      }

      return;

    }

  }else{

    p = get_proc(sd);
    memcpy(p->data,data,sizeof(proc_data));

  }

  printf("\tPID : %d\n",p->data->pid);

  /* These do not change during the scan; reading them up front also keeps
   * them defined when no device is examined at all. */
  size_t _mem = p->data->mem;
  size_t _req = p->data->req;
  nvmlReturn_t res;
  nvmlMemory_t mem = {0};

  for(i = 0 ; i < dem.ndev ; i ++){

    if(!(p->data->flag&CANMIG))
      if(dem.flags[i].stayed)
        continue;

    res = nvmlDeviceGetMemoryInfo(dem.devs[i],&mem);

    if(res != NVML_SUCCESS){
      printf("Failed to get Memory Information\n");
      exit(-1);
    }

#if 0
    printf("mem.free : %lu\n",mem.free);
    printf("reserved : %lu\n",dem.flags[i].reserved);
    printf("_mem     : %lu\n",_mem);
    printf("_req     : %lu\n",_req);
#endif

    if(p->data->pos == i){

      if(mem.free > _req + dem.flags[i].reserved + M64){

        printf("\tGOAHEAD(%d)\n",i);

        if(!(p->data->flag&CANMIG))
          dem.flags[i].stayed = 1;

        dem.flags[i].reserved += _req;

        MSEND(sd,CONNECT,0,0,i,0,0);

        return ;

      }

    }else{

      if(mem.free > _mem + dem.flags[i].reserved + M64){

        printf("\tGOAHEAD(%d)*\n",i);

        if(!(p->data->flag&CANMIG))
          dem.flags[i].stayed = 1;

        /* A new process (pos == -1) has no previous device to give
         * memory back to; indexing flags[] with it would be out of
         * bounds. */
        if(p->data->pos >= 0 && p->data->pos < dem.ndev)
          dem.flags[p->data->pos].reserved -= ( _mem - _req );
        dem.flags[i].reserved += _mem;

        p->data->pos = i;

        MSEND(sd,CONNECT,0,0,i,0,0);

        return ;

      }
    }
  }

  printf("\tOOPS\n");
  printf("mem.free : %llu\n",(unsigned long long)mem.free);
  printf("_req     : %llu\n",(unsigned long long)_req);

  if(!(p->data->flag&CANMIG)){

    dem.staying_procs ++;

    p->queued = STAYED_QUEUED;

    printf("Queued staying procs[%d]\n",dem.staying_procs);

  }else{

    p->queued = QUEUED;

    /* Same bounds guard as above for processes without a device yet. */
    if(p->data->pos >= 0 && p->data->pos < dem.ndev)
      dem.flags[p->data->pos].reserved -= ( _mem -_req );

  }
}
Ejemplo n.º 14
0
/*
 * Try to start the first queued process that fits on device `devNum`.
 *
 * Does nothing when the device's flag is already set. Otherwise the flag is
 * held while the process list (starting after dem.p0 and stopping at the
 * node whose `next` is NULL) is walked. For every QUEUED process NVML is
 * asked for the device's free memory; a failed query terminates the
 * program. The first process that fits becomes ACTIVE and is sent MIGRATE
 * (if it already created a context; the flag stays set) or CONNECT
 * (otherwise; the flag and sd are released again). If no process fits, the
 * flag is released.
 */
void dequeueSpecifyDevNO(int devNum){

  proc* cur;
  nvmlMemory_t meminfo;
  nvmlReturn_t rc;

  if(dem.flags[devNum].flag)return;

  dem.flags[devNum].flag = 1;

  for(cur = dem.p0->next ; cur->next != NULL ; cur = cur->next){

    if(cur->queued != QUEUED)continue;

    rc = nvmlDeviceGetMemoryInfo(dem.devs[devNum],&meminfo);

    if(rc != NVML_SUCCESS){
      printf("Failed to get Memory Info in dequeue\n");
      exit(-1);
    }

    /* This candidate does not fit; look at the next one. */
    if(!(meminfo.free > cur->data->mem + cur->data->req + M64 + dem.flags[devNum].reserved))continue;

    dem.flags[devNum].sd = cur->sd;
    dem.flags[devNum].flag = 1;

    cur->queued = ACTIVE;

    if(cur->created_context){

      /* Existing context: reserve its whole footprint and ask it to move. */
      dem.flags[devNum].reserved += cur->data->mem + cur->data->req;

      MSEND(cur->sd,MIGRATE,0,0,devNum,0,0);

    }else{

      /* No context yet: reserve the symbol area and let it connect. */
      dem.flags[devNum].reserved += cur->data->sym;

      MSEND(cur->sd,CONNECT,0,0,devNum,0,0);

      dem.flags[devNum].sd = -1;
      dem.flags[devNum].flag = 0;

    }

    printf("MIGRATE(%d) to %d\n",cur->sd,devNum);

    /* A process was placed; the flag keeps whatever state was set above. */
    return;
  }

  /* Nobody could be placed: release the claim taken on entry. */
  dem.flags[devNum].flag = 0;
}
Ejemplo n.º 15
0
  /*
   * Fill *nvml with the current GPU counters and gauges.
   *
   * The accumulators (gpu_time, mem_time, energy) and the device count are
   * copied from sp->nvml. Memory totals, ECC error counts and the number of
   * compute processes are summed over all GPUs; temperature and fan speed
   * are the maximum over all GPUs. Individual queries are best effort: a
   * query that fails simply contributes nothing.
   *
   * Returns NO when there are no GPUs or a device handle cannot be
   * obtained, YES otherwise.
   */
  int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
    unsigned int i;

    if(sp->nvml.gpu_count == 0) {
      return NO;
    }

    // pick up latest value of accumulators
    nvml->gpu_time = sp->nvml.nvml_gpu_time;
    nvml->mem_time = sp->nvml.nvml_mem_time;
    nvml->energy = sp->nvml.nvml_energy;

    // and fill in the rest of the counters/gauges too
    nvml->device_count = sp->nvml.gpu_count;

    // zero these, and sum across all GPUs
    nvml->mem_total = 0;
    nvml->mem_free = 0;
    nvml->ecc_errors = 0;
    nvml->processes = 0;

    // use the max across all GPUs
    nvml->temperature = 0;
    nvml->fan_speed = 0;

    for (i = 0; i < sp->nvml.gpu_count; ++i) {
      unsigned long long eccErrors;
      unsigned int temp;
      nvmlDevice_t gpu;
      unsigned int speed;
      // in/out for the process query: the capacity of the (absent) result
      // array on input, the number of running processes on output. It must
      // be 0 on input when only the count is wanted.
      unsigned int procs = 0;
      nvmlMemory_t memInfo;
      nvmlReturn_t result;

      if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
        return NO;
      }
      if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
        nvml->mem_total += memInfo.total;
        nvml->mem_free  += memInfo.free;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
        if (nvml->temperature < temp) {
          nvml->temperature = temp;
        }
      }
      if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
        if (nvml->fan_speed < speed) {
          nvml->fan_speed = speed;
        }
      }
      // With a zero capacity NVML reports the count via
      // NVML_ERROR_INSUFFICIENT_SIZE (or NVML_SUCCESS when nothing runs).
      result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
      if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
        nvml->processes += procs;
      }
    }

    return YES;
  }
Ejemplo n.º 16
0
	// Initialise NVML and prepare the single supported GPU for measuring.
	//
	// Steps, in order:
	//   1. log which variant (full/light) is active;
	//   2. initialise NVML, require at most one device and fetch its handle
	//      into mDevice;
	//   3. query name, PCI info, power management mode, performance state,
	//      the supported memory clocks and the supported graphics clocks
	//      for the lowest and highest memory clock, and the total memory;
	//   4. run the gpu_management tool with "-p 1";
	//   5. log a summary of everything queried;
	//   6. depending on mGpuFrequency either set the min/max clocks through
	//      the gpu_management tool or just report the current clocks;
	//   7. sleep 15 seconds.
	// Any failing step is logged and terminates the process.
	void CMeasureNVML<TSkipMs, TVariant>::init(void) {
		if(TVariant == VARIANT_FULL) {
			mrLog()
			<< ">>> 'nvml' (full version)" << std::endl;
		} else {
			mrLog()
			<< ">>> 'nvml' (light version)" << std::endl;
		}
		
		nvmlReturn_t result;
		int32_t rv;
		char const* args_set_pm[] = {"gpu_management", "-p 1", NULL};
		
		uint32_t device_count;
		char name[NVML_DEVICE_NAME_BUFFER_SIZE];
		nvmlPciInfo_t pci;
		nvmlEnableState_t mode;
		std::string modes[2] = {"disabled", "enabled"};
		std::stringstream clk_gpu_str;
		std::stringstream clk_mem_str;
		nvmlPstates_t power_state;
		nvmlMemory_t memory;
		
		// Capacity of each clock list. The *_cnt variables are in/out: they
		// carry this capacity into NVML and the number of entries back out.
		// The arrays are sized with the constant itself so that they are
		// ordinary fixed-size arrays rather than variable-length ones.
		const uint32_t count			= 32;
		uint32_t clk_mem_cnt			= count;
		uint32_t clk_mem[count];
		uint32_t clk_mem_max			= 0;
		uint32_t clk_mem_min			= 0xffffffff;
		uint32_t clk_mem_set			= 0;
		uint32_t clk_gpu_min_arr_cnt	= count;
		uint32_t clk_gpu_min_arr[count];
		uint32_t clk_gpu_min			= 0xffffffff;
		uint32_t clk_gpu_max_arr_cnt	= count;
		uint32_t clk_gpu_max_arr[count];
		uint32_t clk_gpu_max			= 0;
		uint32_t clk_gpu_set			= 0;
		uint32_t memory_total			= 0;
		
		result = nvmlInit();
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot initialize nvml library. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetCount(&device_count);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot query device count. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		if (device_count > 1) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: this software has be rewritten if you want to support more than 1 device. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << ">>> 'nvml' (thread main): get gpu device handler...";
		mrLog.flush();
		
		result = nvmlDeviceGetHandleByIndex(0, &mDevice);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device handler. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << " done!" << std::endl;
		
		result = nvmlDeviceGetName(mDevice, name, NVML_DEVICE_NAME_BUFFER_SIZE);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device name. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPciInfo(mDevice, &pci);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get pci information. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPowerManagementMode(mDevice, &mode);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no power managment supported. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPerformanceState(mDevice, &power_state);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetSupportedMemoryClocks(mDevice, &clk_mem_cnt, clk_mem);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain memory clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		// Lowest and highest supported memory clock.
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			clk_mem_min = (clk_mem[i]<clk_mem_min) ? clk_mem[i] : clk_mem_min;
			clk_mem_max = (clk_mem[i]>clk_mem_max) ? clk_mem[i] : clk_mem_max;
		}
		
		// Graphics clocks available at the lowest memory clock; keep the minimum.
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_min, &clk_gpu_min_arr_cnt, clk_gpu_min_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			clk_gpu_min = (clk_gpu_min_arr[i]<clk_gpu_min) ? clk_gpu_min_arr[i] : clk_gpu_min;
		}
		
		// Graphics clocks available at the highest memory clock; keep the maximum.
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_max, &clk_gpu_max_arr_cnt, clk_gpu_max_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			clk_gpu_max = (clk_gpu_max_arr[i]>clk_gpu_max) ? clk_gpu_max_arr[i] : clk_gpu_max;
		}
		
		result = nvmlDeviceGetMemoryInfo(mDevice, &memory);
		if (NVML_SUCCESS != result) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		// Shifted right by 20 bits for the "MiB" figure printed below.
		memory_total = (uint32_t)(memory.total >> 20);
		
		rv = exec_gpu_mgmt((char**)args_set_pm);
		if (rv) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		mrLog()
		<< ">>> 'nvml' (thread main): persistence mode enabled." << std::endl;
		
		mrLog()
		<< ">>> 'nvml' (thread main):" << std::endl
		<< "     device         : " << name << std::endl
		<< "     pcie           : " << pci.busId << std::endl
		<< "     power mgmt mode: " << modes[mode] << std::endl
		<< "     power state cur: " << power_state << std::endl
		<< "     power state min: " << NVML_PSTATE_15 << std::endl
		<< "     power state max: " << NVML_PSTATE_0 << std::endl
		<< "     memory total   : " << memory_total << " MiB" << std::endl
		<< "     avail mem clks : ";
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			if (i<(int32_t)clk_mem_cnt-1) {
				mrLog() << clk_mem[i] << " MHz, ";
			} else {
				mrLog() << clk_mem[i] << " MHz" << std::endl;
			}
		}
		
		mrLog()
		<< "     memory clk min : " << clk_mem_min << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_min_arr_cnt-1) {
				mrLog() << clk_gpu_min_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_min_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk min   : " << clk_gpu_min << " MHz" << std::endl;
		
		mrLog()
		<< "     memory clk max : " << clk_mem_max << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_max_arr_cnt-1) {
				mrLog() << clk_gpu_max_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_max_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk max   : " << clk_gpu_max << " MHz" << std::endl;
		
		switch (mGpuFrequency) {
			case GPU_FREQUENCY_MIN:
				clk_mem_set = clk_mem_min;
				clk_gpu_set = clk_gpu_min;
				break;
			case GPU_FREQUENCY_MAX:
				clk_mem_set = clk_mem_max;
				clk_gpu_set = clk_gpu_max;
				break;
			case GPU_FREQUENCY_CUR:
			default:
				clk_mem_set = 0;
				clk_gpu_set = 0;
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &clk_mem_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_GRAPHICS, &clk_gpu_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				break;
		}
		
		if (mGpuFrequency == GPU_FREQUENCY_MIN || mGpuFrequency == GPU_FREQUENCY_MAX) {
			// In these cases we actually set the GPU frequencies either to the maximum or minimum value.
			clk_gpu_str << "-c " << clk_gpu_set;
			clk_mem_str << "-m " << clk_mem_set;
			// str() returns a temporary; keep named copies alive so that the
			// c_str() pointers in the argument vector stay valid for the call.
			const std::string clk_gpu_arg = clk_gpu_str.str();
			const std::string clk_mem_arg = clk_mem_str.str();
			char const* args_set_clk[] = {"gpu_management", clk_gpu_arg.c_str(), clk_mem_arg.c_str(), NULL};
			rv = exec_gpu_mgmt((char**)args_set_clk);
			if (rv) {
				mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
				exit(EXIT_FAILURE);
			}
						
			mrLog()
			<< ">>> 'nvml' (thread main): set core clk to " << clk_gpu_set << " MHz and mem clk to " << clk_mem_set << " MHz." << std::endl;
		} else {
			// We name the values *_set, but we don't set the frequency. We just print the current GPU frequency.
			mrLog()
			<< ">>> 'nvml' (thread main): current core clk is " << clk_gpu_set << " MHz and mem clk is " << clk_mem_set << " MHz." << std::endl;
		}
		mrLog()
		<< ">>> 'nvml' (thread main): wait for 15s to throttle gpu clocks." << std::endl;
		sleep(15);
		
		mrLog()
		<< ">>> 'nvml' (thread main): initialization done." << std::endl
		<< std::endl;
	}