void _CUDAMALLOC(int sd,proc_data* data){ printf("CUDAMALLOC(%d)\n",sd); proc* p; nvmlReturn_t res; nvmlMemory_t mem; int devp; p = get_proc(sd); memcpy(p->data,data,sizeof(proc_data)); devp = data->pos; printf("DEVPOS : %d\n",data->pos); printf("\tPID : %d\n",p->data->pid); res = nvmlDeviceGetMemoryInfo(dem.devs[devp],&mem); if(res != NVML_SUCCESS){ printf("Failed to get Memory Information\n"); exit(-1); } if(mem.free > p->data->req + M64 + dem.flags[devp].reserved){ printf("\tGOAHEAD (REQ : %lu[MB])\n",p->data->req >> 20); MSEND(sd,GOAHEAD,0,0,0,0,0); dem.flags[devp].reserved += p->data->req; }else{
// Build the set of device features static void get_device_features(struct device* dev) { if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature) == NVML_SUCCESS) { dev->feature_support |= TEMPERATURE; } if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) { dev->feature_support |= MEMORY_INFO; } if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) { dev->feature_support |= POWER_USAGE; } if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS, &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM, &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT, &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) { dev->feature_support |= CLOCK_INFO; } if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) { dev->feature_support |= FAN_INFO; } if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) { dev->feature_support |= UTILIZATION_INFO; } }
/*
 * Try to place one specific queued process on the first device that is not
 * currently flagged and has enough free memory.
 *
 * Does nothing when p is NULL or p is not in the QUEUED state. A device is
 * accepted when its free memory exceeds mem + req of the process plus the
 * M64 margin plus what is already reserved on that device. On acceptance the
 * process becomes ACTIVE and is told to MIGRATE (it already has a context) or
 * to CONNECT (it has none yet; the device flag is released again in that
 * case). Exits the program if NVML cannot report memory information.
 */
void dequeueSpecifyProc(proc* p){
    nvmlReturn_t status;
    nvmlMemory_t info;
    int idx;

    if(p == NULL)
        return;
    if(p->queued != QUEUED)
        return;

    for(idx = 0 ; idx < dem.ndev ; idx ++){

        if(dem.flags[idx].flag){
            printf("\tdevice[%d] cannot be selected...\n",idx);
            continue;
        }

        /* Claim the device while it is being examined. */
        dem.flags[idx].flag = 1;

        status = nvmlDeviceGetMemoryInfo(dem.devs[idx],&info);
        if(status != NVML_SUCCESS){
            printf("Failed to get Memory Info in dequeue\n");
            exit(-1);
        }

        if(!(info.free > p->data->mem + p->data->req + M64 + dem.flags[idx].reserved)){
            /* Not enough room here: release the claim and try the next one. */
            dem.flags[idx].flag = 0;
            continue;
        }

        dem.flags[idx].sd = p->sd;
        dem.flags[idx].flag = 1;
        p->queued = ACTIVE;

        if(!p->created_context){
            dem.flags[idx].reserved += p->data->sym;
            MSEND(p->sd,CONNECT,0,0,idx,0,0);
            dem.flags[idx].sd = -1;
            dem.flags[idx].flag = 0;
        }else{
            dem.flags[idx].reserved += p->data->mem + p->data->req;
            MSEND(p->sd,MIGRATE,0,0,idx,0,0);
        }

        printf("MIGRATE(%d) to %d\n",p->sd,idx);
        return;
    }
}
/*
 * Take a one-shot snapshot of every NVML device.
 *
 * Initialises NVML, allocates a zeroed array of devstat (one entry per
 * device) into *stats, fills handler / meminfo / utils / encutil / decutil
 * for each entry, then shuts NVML down again.
 *
 * The per-device queries are best-effort: their return codes are not
 * inspected, so a field whose query failed keeps the zero value calloc gave
 * it.
 *
 * Returns the number of devices, or -1 when the result array could not be
 * allocated (in which case *stats is NULL). The caller owns *stats.
 */
int probe_gpustats(devstat**stats)
{
    unsigned int n_dev;
    unsigned int i;          /* unsigned to match n_dev */
    unsigned int sampp;
    nvmlReturn_t nvret;
    devstat*pstats;

    nvret=nvmlInit();
    CHK_NVML(nvret,"Init NVML");

    nvret=nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret,"getCount");

    *stats=(devstat*)calloc(n_dev,sizeof(devstat));
    pstats=*stats;
    if(pstats==NULL && n_dev>0){
        /* Out of memory: do not hand NVML pointers derived from NULL. */
        nvmlShutdown();
        return -1;
    }

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetHandleByIndex(i,&pstats[i].handler);

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetMemoryInfo(pstats[i].handler,&pstats[i].meminfo);

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetUtilizationRates(pstats[i].handler,&pstats[i].utils);

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetEncoderUtilization(pstats[i].handler,&pstats[i].encutil,&sampp);

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetDecoderUtilization(pstats[i].handler,&pstats[i].decutil,&sampp);

#if 0
    int maxfreeind=0;
    int maxfree=0;
    for(i=0;i<n_dev;i++){
        print_devstats(&pstats[i]);
        int free=pstats[i].meminfo.free;
        // fprintf(stderr,"<%d\n",free);
        if(free>maxfree){
            maxfree=free;
            maxfreeind=i;
        }
    }
#endif

    nvret=nvmlShutdown();
    CHK_NVML(nvret,"Shutdown NVML");

    return (int)n_dev;
}
void CMeasureNVML<TSkipMs, TVariant>::measure(void *pMsMeasurement, int32_t& rThreadNum) { nvmlReturn_t result; MS_MEASUREMENT_GPU *pMsMeasurementGpu = (MS_MEASUREMENT_GPU *) pMsMeasurement; result = nvmlDeviceGetPowerUsage(mDevice, &(pMsMeasurementGpu->nvml_power_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no power usage reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } if(TVariant == VARIANT_FULL) { nvmlMemory_t memory; if(!(mMeasureCounter++ % TSkipMs)) { result = nvmlDeviceGetMemoryInfo(mDevice, &memory); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } pMsMeasurementGpu->nvml_memory_free_cur = (uint32_t)(memory.free >> 10); pMsMeasurementGpu->nvml_memory_used_cur = (uint32_t)(memory.used >> 10); result = nvmlDeviceGetPerformanceState(mDevice, (nvmlPstates_t*)&(pMsMeasurementGpu->internal.nvml_power_state)); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } nvmlTemperatureSensors_t sensorType = NVML_TEMPERATURE_GPU; result = nvmlDeviceGetTemperature(mDevice, sensorType, &(pMsMeasurementGpu->nvml_temperature_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read temperature. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_SM, &(pMsMeasurementGpu->nvml_clock_sm_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &(pMsMeasurementGpu->nvml_clock_mem_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } }
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray) { nvmlReturn_t ret; ret=nvmlInit(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int c; ret=nvmlDeviceGetCount(&c); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret)); return -1; } *ncores=c; nvmlDevice_t devs[NDEV]; nvmlMemory_t meminfo; int i; for(i=0; i<c; i++) { ret=nvmlDeviceGetHandleByIndex(i,&devs[i]); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret)); return -1; } ret=nvmlDeviceGetMemoryInfo(devs[i],&meminfo); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret)); return -1; } usedarray[i]=meminfo.used; } ret=nvmlShutdown(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret)); return -1; } return 0; }
static void init_device_info(struct monitor* mon) { gethostname(mon->hostname, 64); NVML_TRY(nvmlSystemGetDriverVersion(mon->driver_version, sizeof(mon->driver_version))); NVML_TRY(nvmlSystemGetNVMLVersion(mon->nvml_version, sizeof(mon->nvml_version))); NVML_TRY(nvmlDeviceGetCount(&mon->dev_count)); mon->devices = calloc(mon->dev_count, sizeof(struct device)); for(unsigned i = 0; i < mon->dev_count; ++i) { struct device dev; memset(&dev, 0, sizeof(struct device)); dev.index = i; NVML_TRY(nvmlDeviceGetHandleByIndex(i, &dev.handle)); NVML_TRY(nvmlDeviceGetName(dev.handle, dev.name, sizeof(dev.name))); NVML_TRY(nvmlDeviceGetSerial(dev.handle, dev.serial, sizeof(dev.serial))); NVML_TRY(nvmlDeviceGetUUID(dev.handle, dev.uuid, sizeof(dev.uuid))); NVML_TRY(nvmlDeviceGetPciInfo(dev.handle, &dev.pci)); NVML_TRY(nvmlDeviceGetMemoryInfo(dev.handle, &dev.memory)); unsigned long long event_types; NVML_TRY(nvmlEventSetCreate(&dev.event_set)); if(0 == NVML_TRY(nvmlDeviceGetSupportedEventTypes(dev.handle, &event_types))) { NVML_TRY(nvmlDeviceRegisterEvents(dev.handle, event_types, dev.event_set)); } else { dev.event_set = NULL; } for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT; ++type) { if(NVML_TRY(nvmlDeviceGetMaxClockInfo(dev.handle, type, &dev.max_clock[type]))) break; } get_device_features(&dev); mon->devices[i] = dev; } mon->last_update = time(NULL); }
/**
 * Query the device's total memory via NVML and store it, shifted right by
 * 10 bits, in the nvml_memory_total field of the measurement record.
 *
 * @param pMsMeasurement  Pointer that is treated as a MS_MEASUREMENT_GPU.
 * @param rThreadNum      Thread number, used only in the error message.
 *
 * Logs an error and terminates the process if NVML cannot provide the
 * memory information.
 */
void CMeasureNVML<TSkipMs, TVariant>::read_memory_total(void *pMsMeasurement, int32_t& rThreadNum) {
	nvmlMemory_t memInfo;
	const nvmlReturn_t status = nvmlDeviceGetMemoryInfo(mDevice, &memInfo);

	if (status != NVML_SUCCESS) {
		mrLog.lock();
		mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
		mrLog.unlock();
		exit(EXIT_FAILURE);
	}

	MS_MEASUREMENT_GPU *pGpuRecord = static_cast<MS_MEASUREMENT_GPU *>(pMsMeasurement);
	pGpuRecord->nvml_memory_total = static_cast<uint32_t>(memInfo.total >> 10);
}
/*
 * Handle a CUDAMALLOC request arriving on socket sd.
 *
 * The process registered for sd has its data pointer replaced by `data`,
 * and the device at data->pos is asked for its free memory. If the free
 * memory exceeds the growth of the process' region (new data->mem minus the
 * previously recorded mem) a CONNECT reply is sent; otherwise a SUSPEND
 * reply is sent, the process is marked queued and dequeueSpecifyProc() is
 * tried for it.
 *
 * Exits the program if the reply buffer cannot be allocated or NVML cannot
 * report memory information.
 */
void _CUDAMALLOC(int sd,proc_data* data){
    proc_data* sendProc;
    proc* p;
    size_t prevRegion;
    size_t growth;
    nvmlReturn_t res;
    nvmlMemory_t mem;
    int devp;

    printf("CUDAMALLOC\n");

    /* Zeroed so that no uninitialised heap bytes are sent to the peer. */
    sendProc = (proc_data*)calloc(1,sizeof(proc_data));
    if(sendProc == NULL){
        printf("Failed to allocate reply message\n");
        exit(-1);
    }

    p = get_proc(sd);
    prevRegion = p->data->mem;
    /* NOTE(review): the previous p->data is dropped here without being
       freed; ownership is not visible from this function - confirm. */
    p->data = data;
    devp = data->pos;

    res = nvmlDeviceGetMemoryInfo(dem.devs[devp],&mem);
    if(res != NVML_SUCCESS){
        printf("Failed to get Memory Information\n");
        exit(-1);
    }

    /* Only growth needs new memory. A shrinking region must not wrap the
       unsigned subtraction around to a huge value. */
    growth = (data->mem > prevRegion) ? (size_t)(data->mem - prevRegion) : 0;

    if(mem.free > growth){
        printf("\tGOAHEAD\n");
        sendProc->REQUEST = CONNECT;
        send(sd,sendProc,sizeof(proc_data),0);
    }else{
        printf("\tOOPS\n");
        sendProc->REQUEST = SUSPEND;
        p->queued = 1;
        send(sd,sendProc,sizeof(proc_data),0);
        dequeueSpecifyProc(p);
    }

    /* The reply has been handed to send(); the buffer is no longer needed. */
    free(sendProc);
}
static void update_device_info(struct monitor* mon) { // TODO: NVML is thread safe, and the order we grab GPU information // here doesn't particularly matter, so might as well take advantage // of parallelism here. unsigned i; for(i = 0; i < mon->dev_count; ++i) { struct device* dev = &mon->devices[i]; if(dev->feature_support & MEMORY_INFO) { NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory)); } if(dev->feature_support & TEMPERATURE) { NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature)); } if(dev->feature_support & POWER_USAGE) { NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage)); } if(dev->feature_support & CLOCK_INFO) { for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT; ++type) { NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type])); } } if(dev->feature_support & FAN_INFO) { NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan)); } if(dev->event_set != NULL) { nvmlEventData_t data; NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1)); // TODO: Do something with the returned information. } } mon->last_update = time(NULL); }
/*
 * Return one field of the NVML memory information of a device.
 *
 * dev       - NVML device handle to query.
 * which_one - MEMINFO_TOTAL_MEMORY, MEMINFO_UNALLOCED or MEMINFO_ALLOCED,
 *             selecting meminfo.total, meminfo.free or meminfo.used.
 *
 * Returns the selected value, or (unsigned long long)-1 when the NVML query
 * fails or which_one is not one of the selectors above.
 */
unsigned long long getMemoryInfo( nvmlDevice_t dev, int which_one )
{
    nvmlMemory_t meminfo;
    nvmlReturn_t bad;

    bad = nvmlDeviceGetMemoryInfo( dev, &meminfo );
    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
        /* meminfo was not filled in; never report its indeterminate
           contents as a measurement. */
        return (unsigned long long)-1;
    }

    switch (which_one) {
    case MEMINFO_TOTAL_MEMORY:
        return meminfo.total;
    case MEMINFO_UNALLOCED:
        return meminfo.free;
    case MEMINFO_ALLOCED:
        return meminfo.used;
    default:
        ;
    }

    return (unsigned long long)-1;
}
void _FIN(int sd){ if(!has_proc_sd(sd)){ cons* c; c = get_cons(sd); remove_cons(c); printf("CONS LEAVE\n"); return; } dem.procCounter--; printf("FIN(SD:%d) (NUM OF PROCS:%d)\n",dem.procCounter); proc *p; int devNum,i; for(i = 0 ; i < dem.ndev ; i ++){ if(dem.flags[i].sd == sd){ printf("\tFIND FAILED PROC(SD:%d)\n",sd); dem.flags[i].flag = 0; dem.flags[i].sd = 0; } } p = get_proc(sd); devNum = p->data->pos; printf("\tPID : %d\n",p->data->pid); if(!(p->data->flag&CANMIG)){ printf("Staying proc @ %d finished.\n",devNum); dem.flags[devNum].stayed = 0; if(dem.staying_procs > 0){ proc* stayp; stayp = (proc*)staying_proc(); if(stayp == NULL){ printf("oops ... failed to find staying proc...\n"); exit(-1); } dem.staying_procs --; stayp->queued = ACTIVE; MSEND(stayp->sd,CONNECT,0,0,devNum,0,0); dem.flags[devNum].stayed = 1; }else{ dequeueSpecifyDevNO(devNum); } }else{ if(dem.flags[devNum].stayed){ proc* ps; ps = (proc*)get_proc_staying_pos(devNum); if(ps == NULL){ printf("PROC (CANNOT MIGRATE) IS ACTIVE NOW(%d)\n",devNum); #if 0 dequeueSpecifyDevNO(devNum); #endif }else{ nvmlReturn_t res; nvmlMemory_t mem; res = nvmlDeviceGetMemoryInfo(dem.devs[devNum],&mem); if(res != NVML_SUCCESS){ printf("Failed to get Memory Information\n"); exit(-1); } printf("USED : %u\n",ps->data->mem); printf("FREE : %u\n",mem.free); if(mem.free > ps->data->req + M64 + dem.flags[devNum].reserved){ MSEND(ps->sd,GOAHEAD,0,0,devNum,0,0); ps->queued = ACTIVE; } #if 0 else{ dequeueSpecifyDevNO(devNum); } #endif } }else{ if(dem.staying_procs > 0){ proc* ps; ps = staying_proc(); if(ps == NULL){ printf("What ...? 1\n"); exit(-1); } MSEND(ps->sd,CONNECT,0,0,devNum,0,0); dem.staying_procs --; dem.flags[devNum].stayed = 1; ps->queued = ACTIVE; }else{ dequeueSpecifyDevNO(devNum); } } } cons_remove(p); remove_proc(p); }
/*
 * Handle a CONNECT request arriving on socket sd.
 *
 * data->pos == -1 marks a first contact: a proc entry is created, the
 * request is copied into it, and if the process counter now exceeds MAXPROC
 * the process is queued immediately (STAYED_QUEUED without CANMIG, QUEUED
 * with it). Otherwise the existing entry for sd is refreshed from `data`.
 *
 * The devices are then scanned in order. Devices marked "stayed" are skipped
 * for processes without CANMIG. On the process' current device `req` extra
 * bytes must fit; on any other device the whole `mem` must fit (both on top
 * of the device's reservation and the M64 margin). The first fitting device
 * gets the reservation and a CONNECT reply. If none fits the process is
 * queued.
 *
 * Exits the program if the per-process data cannot be allocated or NVML
 * cannot report memory information, matching the other request handlers.
 */
void _CONNECT(int sd,proc_data* data){
    proc* p;
    int i;
    size_t _mem;
    size_t _req;
    nvmlReturn_t res;
    nvmlMemory_t mem;

    printf("CONNECT(%d) (QUEUESIZE :%d)\n",sd,queue_size());
    printf(" (NUM OF PROCS:%d)\n",dem.procCounter);

    if(data->pos == -1){
        dem.procCounter++;
        p = create_proc(sd);
        add_proc(p);

        p->data = (proc_data*)malloc(sizeof(proc_data));
        if(p->data == NULL){
            printf("Failed to allocate proc data\n");
            exit(-1);
        }
        memcpy(p->data,data,sizeof(proc_data));

        if(dem.procCounter > MAXPROC){
            if(!(p->data->flag&CANMIG)){
                dem.staying_procs ++;
                p->queued = STAYED_QUEUED;
            }else{
                p->queued = QUEUED;
            }
            return;
        }
    }else{
        p = get_proc(sd);
        memcpy(p->data,data,sizeof(proc_data));
    }

    printf("\tPID : %d\n",p->data->pid);

    /* These do not change during the scan; reading them up front also keeps
       them defined when every device is skipped or there are no devices. */
    _mem = p->data->mem;
    _req = p->data->req;
    memset(&mem,0,sizeof(mem));

    for(i = 0 ; i < dem.ndev ; i ++){

        if(!(p->data->flag&CANMIG))
            if(dem.flags[i].stayed)
                continue;

        res = nvmlDeviceGetMemoryInfo(dem.devs[i],&mem);
        if(res != NVML_SUCCESS){
            printf("Failed to get Memory Information\n");
            exit(-1);
        }

        if(p->data->pos == i){

            if(mem.free > _req + dem.flags[i].reserved + M64){
                printf("\tGOAHEAD(%d)\n",i);
                if(!(p->data->flag&CANMIG))
                    dem.flags[i].stayed = 1;
                dem.flags[i].reserved += _req;
                MSEND(sd,CONNECT,0,0,i,0,0);
                return ;
            }

        }else{

            if(mem.free > _mem + dem.flags[i].reserved + M64){
                printf("\tGOAHEAD(%d)*\n",i);
                if(!(p->data->flag&CANMIG))
                    dem.flags[i].stayed = 1;
                /* A first-contact process (pos == -1) has no previous device
                   whose reservation could be released. */
                if(p->data->pos >= 0)
                    dem.flags[p->data->pos].reserved -= ( _mem - _req );
                dem.flags[i].reserved += _mem;
                p->data->pos = i;
                MSEND(sd,CONNECT,0,0,i,0,0);
                return ;
            }
        }
    }

    printf("\tOOPS\n");
    printf("mem.free : %llu\n",(unsigned long long)mem.free);
    printf("_req : %llu\n",(unsigned long long)_req);

    if(!(p->data->flag&CANMIG)){
        dem.staying_procs ++;
        p->queued = STAYED_QUEUED;
        printf("Queued staying procs[%d]\n",dem.staying_procs);
    }else{
        p->queued = QUEUED;
        if(p->data->pos >= 0)
            dem.flags[p->data->pos].reserved -= ( _mem -_req );
    }
}
void dequeueSpecifyDevNO(int devNum){ if(dem.flags[devNum].flag)return; dem.flags[devNum].flag = 1; proc* ptemp; nvmlReturn_t res; nvmlMemory_t mem; int find; find = 0; ptemp = dem.p0->next; while(ptemp->next != NULL){ if(ptemp->queued == QUEUED){ res = nvmlDeviceGetMemoryInfo(dem.devs[devNum],&mem); if(res != NVML_SUCCESS){ printf("Failed to get Memory Info in dequeue\n"); exit(-1); } // if(mem.free > ptemp->data->mem + M64 + dem.flags[devNum].reserved){ if(mem.free > ptemp->data->mem + ptemp->data->req + M64 + dem.flags[devNum].reserved){ dem.flags[devNum].sd = ptemp->sd; dem.flags[devNum].flag = 1; ptemp->queued = ACTIVE; if(ptemp->created_context){ dem.flags[devNum].reserved += ptemp->data->mem + ptemp->data->req; MSEND(ptemp->sd,MIGRATE,0,0,devNum,0,0); }else{ dem.flags[devNum].reserved += ptemp->data->sym; MSEND(ptemp->sd,CONNECT,0,0,devNum,0,0); dem.flags[devNum].sd = -1; dem.flags[devNum].flag = 0; } printf("MIGRATE(%d) to %d\n",ptemp->sd,devNum); find = 1; break; } } ptemp = ptemp->next; } if(!find){ dem.flags[devNum].flag = 0; } }
/*
 * Fill an SFLHost_gpu_nvml record from the agent state and live NVML
 * readings.
 *
 * Accumulators (gpu_time, mem_time, energy) and the device count are copied
 * from sp->nvml. mem_total, mem_free, ecc_errors and processes are summed
 * over all GPUs; temperature and fan_speed are the maximum over all GPUs.
 * A reading that NVML cannot supply for a device is simply left out.
 *
 * Returns NO when there are no GPUs or a device handle cannot be obtained,
 * YES otherwise.
 */
int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml)
{
  unsigned int i;

  if(sp->nvml.gpu_count == 0) {
    return NO;
  }

  // pick up latest value of accumulators
  nvml->gpu_time = sp->nvml.nvml_gpu_time;
  nvml->mem_time = sp->nvml.nvml_mem_time;
  nvml->energy = sp->nvml.nvml_energy;

  // and fill in the rest of the counters/gauges too
  nvml->device_count = sp->nvml.gpu_count;

  // zero these, and sum across all GPUs
  nvml->mem_total = 0;
  nvml->mem_free = 0;
  nvml->ecc_errors = 0;
  nvml->processes = 0;

  // use the max across all GPUs
  nvml->temperature = 0;
  nvml->fan_speed = 0;

  for (i = 0; i < sp->nvml.gpu_count; ++i) {
    unsigned long long eccErrors;
    unsigned int temp;
    nvmlDevice_t gpu;
    unsigned int speed;
    unsigned int procs;
    nvmlMemory_t memInfo;
    nvmlReturn_t result;

    if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
      return NO;
    }

    if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
      nvml->mem_total += memInfo.total;
      nvml->mem_free += memInfo.free;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
      if (nvml->temperature < temp) {
        nvml->temperature = temp;
      }
    }

    if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
      if (nvml->fan_speed < speed) {
        nvml->fan_speed = speed;
      }
    }

    // The count argument is in/out: on input it is the capacity of the
    // (here absent) info array, so it must be 0 to ask only for the number
    // of running processes. Leaving it uninitialised makes the call's
    // outcome depend on stack garbage.
    procs = 0;
    result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
    if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
      nvml->processes += procs;
    }
  }

  return YES;
}
void CMeasureNVML<TSkipMs, TVariant>::init(void) { if(TVariant == VARIANT_FULL) { mrLog() << ">>> 'nvml' (full version)" << std::endl; } else { mrLog() << ">>> 'nvml' (light version)" << std::endl; } nvmlReturn_t result; int32_t rv; char const* args_set_pm[] = {"gpu_management", "-p 1", NULL}; uint32_t device_count; char name[NVML_DEVICE_NAME_BUFFER_SIZE]; nvmlPciInfo_t pci; nvmlEnableState_t mode; std::string modes[2] = {"disabled", "enabled"}; std::stringstream clk_gpu_str; std::stringstream clk_mem_str; nvmlPstates_t power_state; nvmlMemory_t memory; const uint32_t count = 32; uint32_t clk_mem_cnt = count; uint32_t clk_mem[count]; uint32_t clk_mem_max = 0; uint32_t clk_mem_min = 0xffffffff; uint32_t clk_mem_set = 0; uint32_t clk_gpu_min_arr_cnt = count; uint32_t clk_gpu_min_arr[clk_gpu_min_arr_cnt]; uint32_t clk_gpu_min = 0xffffffff; uint32_t clk_gpu_max_arr_cnt = count; uint32_t clk_gpu_max_arr[clk_gpu_max_arr_cnt]; uint32_t clk_gpu_max = 0; uint32_t clk_gpu_set = 0; uint32_t memory_total = 0; result = nvmlInit(); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot initialize nvml library. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetCount(&device_count); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot query device count. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } if (device_count > 1) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: this software has be rewritten if you want to support more than 1 device. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): get gpu device handler..."; mrLog.flush(); result = nvmlDeviceGetHandleByIndex(0, &mDevice); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device handler. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << " done!" << std::endl; result = nvmlDeviceGetName(mDevice, name, NVML_DEVICE_NAME_BUFFER_SIZE); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device name. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPciInfo(mDevice, &pci); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get pci information. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPowerManagementMode(mDevice, &mode); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no power managment supported. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPerformanceState(mDevice, &power_state); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetSupportedMemoryClocks(mDevice, &clk_mem_cnt, clk_mem); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain memory clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int i=0; i<(int32_t)clk_mem_cnt; ++i) { clk_mem_min = (clk_mem[i]<clk_mem_min) ? clk_mem[i] : clk_mem_min; clk_mem_max = (clk_mem[i]>clk_mem_max) ? clk_mem[i] : clk_mem_max; } result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_min, &clk_gpu_min_arr_cnt, clk_gpu_min_arr); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) { clk_gpu_min = (clk_gpu_min_arr[i]<clk_gpu_min) ? clk_gpu_min_arr[i] : clk_gpu_min; } result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_max, &clk_gpu_max_arr_cnt, clk_gpu_max_arr); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) { clk_gpu_max = (clk_gpu_max_arr[i]>clk_gpu_max) ? clk_gpu_max_arr[i] : clk_gpu_max; } result = nvmlDeviceGetMemoryInfo(mDevice, &memory); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } memory_total = (uint32_t)(memory.total >> 20); rv = exec_gpu_mgmt((char**)args_set_pm); if (rv) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): persistence mode enabled." 
<< std::endl; mrLog() << ">>> 'nvml' (thread main):" << std::endl << " device : " << name << std::endl << " pcie : " << pci.busId << std::endl << " power mgmt mode: " << modes[mode] << std::endl << " power state cur: " << power_state << std::endl << " power state min: " << NVML_PSTATE_15 << std::endl << " power state max: " << NVML_PSTATE_0 << std::endl << " memory total : " << memory_total << " MiB" << std::endl << " avail mem clks : "; for (int i=0; i<(int32_t)clk_mem_cnt; ++i) { if (i<(int32_t)clk_mem_cnt-1) { mrLog() << clk_mem[i] << " MHz, "; } else { mrLog() << clk_mem[i] << " MHz" << std::endl; } } mrLog() << " memory clk min : " << clk_mem_min << " MHz" << std::endl << " avail core clks: "; for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) { if (i<(int32_t)clk_gpu_min_arr_cnt-1) { mrLog() << clk_gpu_min_arr[i] << " MHz, "; } else { mrLog() << clk_gpu_min_arr[i] << " MHz" << std::endl; } } mrLog() << " core clk min : " << clk_gpu_min << " MHz" << std::endl; mrLog() << " memory clk max : " << clk_mem_max << " MHz" << std::endl << " avail core clks: "; for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) { if (i<(int32_t)clk_gpu_max_arr_cnt-1) { mrLog() << clk_gpu_max_arr[i] << " MHz, "; } else { mrLog() << clk_gpu_max_arr[i] << " MHz" << std::endl; } } mrLog() << " core clk max : " << clk_gpu_max << " MHz" << std::endl; switch (mGpuFrequency) { case GPU_FREQUENCY_MIN: clk_mem_set = clk_mem_min; clk_gpu_set = clk_gpu_min; break; case GPU_FREQUENCY_MAX: clk_mem_set = clk_mem_max; clk_gpu_set = clk_gpu_max; break; case GPU_FREQUENCY_CUR: default: clk_mem_set = 0; clk_gpu_set = 0; result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &clk_mem_set); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_GRAPHICS, &clk_gpu_set); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } break; } if (mGpuFrequency == GPU_FREQUENCY_MIN || mGpuFrequency == GPU_FREQUENCY_MAX) { // In these cases we actually set the GPU frequencies either to the maximum or minimum value. clk_gpu_str << "-c " << clk_gpu_set; clk_mem_str << "-m " << clk_mem_set; char const* args_set_clk[] = {"gpu_management", clk_gpu_str.str().c_str() , clk_mem_str.str().c_str(), NULL}; rv = exec_gpu_mgmt((char**)args_set_clk); if (rv) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): set core clk to " << clk_gpu_set << " MHz and mem clk to " << clk_mem_set << " MHz." << std::endl; } else { // We name the values *_set, but we don't set the frequency. We just print the current GPU frequency. mrLog() << ">>> 'nvml' (thread main): current core clk is " << clk_gpu_set << " MHz and mem clk is " << clk_mem_set << " MHz." << std::endl; } mrLog() << ">>> 'nvml' (thread main): wait for 15s to throttle gpu clocks." << std::endl; sleep(15); mrLog() << ">>> 'nvml' (thread main): initialization done." << std::endl << std::endl; }