// Build the set of device features static void get_device_features(struct device* dev) { if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature) == NVML_SUCCESS) { dev->feature_support |= TEMPERATURE; } if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) { dev->feature_support |= MEMORY_INFO; } if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) { dev->feature_support |= POWER_USAGE; } if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS, &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM, &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT, &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) { dev->feature_support |= CLOCK_INFO; } if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) { dev->feature_support |= FAN_INFO; } if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) { dev->feature_support |= UTILIZATION_INFO; } }
/*
 * Take a one-shot snapshot of every GPU that NVML enumerates.
 *
 * Initialises NVML, allocates one zero-filled devstat per device, fills in
 * each device's handle, memory info, utilization rates and encoder/decoder
 * utilization, then shuts NVML down again.
 *
 * stats: out-parameter; receives the calloc()-allocated array. This function
 *        never frees it. It is set to NULL when allocation fails.
 *
 * Returns the number of devices, or -1 if the array could not be allocated.
 * NVML init/count/shutdown errors are handed to CHK_NVML, which is defined
 * elsewhere; what it does on failure is not visible from here.
 *
 * Per-device queries are best-effort: a query that fails leaves the
 * corresponding devstat fields at the zero value calloc() gave them.
 */
int probe_gpustats(devstat **stats)
{
    unsigned int n_dev = 0;
    nvmlReturn_t nvret;

    nvret = nvmlInit();
    CHK_NVML(nvret, "Init NVML");
    nvret = nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret, "getCount");

    devstat *pstats = (devstat *)calloc(n_dev, sizeof *pstats);
    *stats = pstats;
    if (pstats == NULL && n_dev > 0) {
        /* Out of memory: balance nvmlInit() before reporting the failure. */
        nvmlShutdown();
        return -1;
    }

    for (unsigned int i = 0; i < n_dev; i++) {
        devstat *d = &pstats[i];
        unsigned int sampling_period; /* required by the API, not kept */

        /* Without a handle none of the other queries can succeed. */
        if (nvmlDeviceGetHandleByIndex(i, &d->handler) != NVML_SUCCESS) {
            continue;
        }

        (void)nvmlDeviceGetMemoryInfo(d->handler, &d->meminfo);
        (void)nvmlDeviceGetUtilizationRates(d->handler, &d->utils);
        (void)nvmlDeviceGetEncoderUtilization(d->handler, &d->encutil,
                                              &sampling_period);
        (void)nvmlDeviceGetDecoderUtilization(d->handler, &d->decutil,
                                              &sampling_period);
    }

    nvret = nvmlShutdown();
    CHK_NVML(nvret, "Shutdown NVML");
    return (int)n_dev;
}
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    getState
 * Signature: ()I
 *
 * Packs the current state of the file-level `device` into one integer:
 *
 *     bits 16-23      bits 8-15        bits 0-7
 *     infoCount       utilization.memory   utilization.gpu
 *
 * Each field is masked to 8 bits, so a process count above 255 is truncated.
 * If the process query fails without updating infoCount, the count field
 * reads 0xFF. If the utilization query fails, both utilization fields are 0.
 */
JNIEXPORT jint JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_getState
  (JNIEnv *, jobject)
{
    // All-ones, i.e. the same value as the former `(unsigned)-1` initialiser.
    unsigned int infoCount = ~0u;

    // NOTE(review): infoCount is also the capacity NVML is told `pinfos` has.
    // All-ones claims an effectively unbounded buffer; confirm the real size
    // of `pinfos` (declared elsewhere) and pass that instead.
    nvmlDeviceGetComputeRunningProcesses(device, &infoCount, pinfos);

    // Never read the struct uninitialised: fall back to zero on failure.
    nvmlUtilization_t utilization;
    if (nvmlDeviceGetUtilizationRates(device, &utilization) != NVML_SUCCESS) {
        utilization.gpu = 0;
        utilization.memory = 0;
    }

    unsigned int return_value = 0;
    return_value  = (utilization.gpu)         & 0x000000FF;
    return_value |= (utilization.memory << 8) & 0x0000FF00;
    return_value |= (infoCount << 16)         & 0x00FF0000;

    return return_value;
}
/*
 * Read one utilization rate from a device.
 *
 * which_one selects the field:
 *     0 => gpu util     (GPU_UTILIZATION)
 *     1 => memory util  (MEMORY_UTILIZATION)
 *
 * Returns the selected rate, or (unsigned long long) -1 when the NVML query
 * fails or which_one is not one of the selectors above.
 */
unsigned long long getUtilization(nvmlDevice_t dev, int which_one)
{
    nvmlUtilization_t util;
    nvmlReturn_t bad;

    bad = nvmlDeviceGetUtilizationRates(dev, &util);
    if (NVML_SUCCESS != bad) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
        /* util was not filled in; do not return its indeterminate contents. */
        return (unsigned long long) -1;
    }

    switch (which_one) {
    case GPU_UTILIZATION:
        return (unsigned long long) util.gpu;
    case MEMORY_UTILIZATION:
        return (unsigned long long) util.memory;
    default:
        break;
    }

    return (unsigned long long) -1;
}
/*_________________---------------------------__________________
  _________________        nvml_tick          __________________
  -----------------___________________________------------------
  Called every second.

  Walks GPU indices 0 .. sp->nvml.gpu_count-1 and adds each device's current
  readings to the running totals held in sp->nvml. A device whose handle
  cannot be obtained is skipped; a reading that fails is simply not added.
*/
void nvml_tick(HSP *sp)
{
  unsigned int idx;

  /* nothing to poll */
  if (!(sp->nvml.gpu_count > 0)) {
    return;
  }

  for (idx = 0; idx < sp->nvml.gpu_count; ++idx) {
    nvmlDevice_t handle;
    nvmlUtilization_t rates;
    unsigned int milliwatts;

    if (nvmlDeviceGetHandleByIndex(idx, &handle) != NVML_SUCCESS) {
      continue;
    }

    if (nvmlDeviceGetUtilizationRates(handle, &rates) == NVML_SUCCESS) {
      /* accumulate as mS */
      sp->nvml.nvml_gpu_time += rates.gpu * 10;
      sp->nvml.nvml_mem_time += rates.memory * 10;
    }

    if (nvmlDeviceGetPowerUsage(handle, &milliwatts) == NVML_SUCCESS) {
      /* accumulate as mJ */
      sp->nvml.nvml_energy += milliwatts;
    }
  }
}