// Build the set of device features static void get_device_features(struct device* dev) { if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature) == NVML_SUCCESS) { dev->feature_support |= TEMPERATURE; } if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) { dev->feature_support |= MEMORY_INFO; } if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) { dev->feature_support |= POWER_USAGE; } if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS, &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM, &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT, &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) { dev->feature_support |= CLOCK_INFO; } if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) { dev->feature_support |= FAN_INFO; } if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) { dev->feature_support |= UTILIZATION_INFO; } }
// Takes one NVML sample for mDevice and stores it in the MS_MEASUREMENT_GPU
// that pMsMeasurement points to.
//
// pMsMeasurement  untyped pointer, cast to MS_MEASUREMENT_GPU* before use.
// rThreadNum      only used to label the error messages.
//
// Every call reads the power usage into nvml_power_cur. When
// TVariant == VARIANT_FULL, every call for which (mMeasureCounter++ % TSkipMs)
// is zero additionally reads: memory info (free/used, each shifted right by 10,
// i.e. divided by 1024, and narrowed to uint32_t), the performance state, the
// NVML_TEMPERATURE_GPU sensor, and the SM and memory clocks.
//
// Any NVML call that does not return NVML_SUCCESS writes a message to mrLog
// and terminates the process with exit(EXIT_FAILURE).
//
// NOTE(review): the performance-state error path writes to mrLog without the
// mrLog.lock()/mrLog.unlock() pair every other error path here uses -- confirm
// whether that is intentional.
// NOTE(review): as visible here, the closing braces of the VARIANT_FULL block
// and of the function are missing, and one string literal is split across a
// line break; this text looks truncated/mangled, so the code below is left
// untouched.
void CMeasureNVML<TSkipMs, TVariant>::measure(void *pMsMeasurement, int32_t& rThreadNum) { nvmlReturn_t result; MS_MEASUREMENT_GPU *pMsMeasurementGpu = (MS_MEASUREMENT_GPU *) pMsMeasurement; result = nvmlDeviceGetPowerUsage(mDevice, &(pMsMeasurementGpu->nvml_power_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no power usage reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } if(TVariant == VARIANT_FULL) { nvmlMemory_t memory; if(!(mMeasureCounter++ % TSkipMs)) { result = nvmlDeviceGetMemoryInfo(mDevice, &memory); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } pMsMeasurementGpu->nvml_memory_free_cur = (uint32_t)(memory.free >> 10); pMsMeasurementGpu->nvml_memory_used_cur = (uint32_t)(memory.used >> 10); result = nvmlDeviceGetPerformanceState(mDevice, (nvmlPstates_t*)&(pMsMeasurementGpu->internal.nvml_power_state)); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } nvmlTemperatureSensors_t sensorType = NVML_TEMPERATURE_GPU; result = nvmlDeviceGetTemperature(mDevice, sensorType, &(pMsMeasurementGpu->nvml_temperature_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read temperature. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_SM, &(pMsMeasurementGpu->nvml_clock_sm_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &(pMsMeasurementGpu->nvml_clock_mem_cur)); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } }
/*
 * Read the NVML_TEMPERATURE_GPU sensor of `dev`.
 *
 * Returns the value NVML stored in the local reading, widened to
 * unsigned long long. If the query does not return NVML_SUCCESS the NVML
 * error string is reported through SUBDBG and whatever is in the local
 * (initialised to 0 before the call) is still returned; no error is
 * signalled to the caller.
 */
unsigned long long getTemperature( nvmlDevice_t dev )
{
    unsigned int reading = 0;
    const nvmlReturn_t status =
        nvmlDeviceGetTemperature( dev, NVML_TEMPERATURE_GPU, &reading );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    return (unsigned long long)reading;
}
static void update_device_info(struct monitor* mon) { // TODO: NVML is thread safe, and the order we grab GPU information // here doesn't particularly matter, so might as well take advantage // of parallelism here. unsigned i; for(i = 0; i < mon->dev_count; ++i) { struct device* dev = &mon->devices[i]; if(dev->feature_support & MEMORY_INFO) { NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory)); } if(dev->feature_support & TEMPERATURE) { NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature)); } if(dev->feature_support & POWER_USAGE) { NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage)); } if(dev->feature_support & CLOCK_INFO) { for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT; ++type) { NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type])); } } if(dev->feature_support & FAN_INFO) { NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan)); } if(dev->event_set != NULL) { nvmlEventData_t data; NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1)); // TODO: Do something with the returned information. } } mon->last_update = time(NULL); }
/**
 * Refresh gpu_temp[] with the NVML_TEMPERATURE_GPU reading of every device
 * NVML reports.
 *
 * The whole body is compiled only when ENABLE_NVML == 1; otherwise this
 * function does nothing. Each NVML call is wrapped in NVML_CHECK (defined
 * elsewhere; its failure behaviour is not visible from here).
 *
 * NOTE(review): gpu_temp is indexed by the NVML device index without a
 * bounds check; its size is not visible here -- confirm it can hold
 * nvmlDeviceGetCount() entries.
 */
void update_temperature(void)
{
#if (ENABLE_NVML==1)
    unsigned int deviceCount;
    NVML_CHECK(nvmlDeviceGetCount( &deviceCount ));

    for( unsigned int devIdx = 0; devIdx < deviceCount; ++devIdx )
    {
        nvmlDevice_t devHandle;
        NVML_CHECK(nvmlDeviceGetHandleByIndex( devIdx, &devHandle ));

        unsigned int devTemperature;
        NVML_CHECK(nvmlDeviceGetTemperature( devHandle, NVML_TEMPERATURE_GPU, &devTemperature ));

        gpu_temp[devIdx] = devTemperature;

        // Both arguments are unsigned int, so they are formatted with %u
        // (assumes DEBUG_PRINTF takes printf-style format strings).
        DEBUG_PRINTF("temperature updated: (gpu %u) %u \n", devIdx, devTemperature);
    }
#endif
}
/*
 * Fill *nvml with the current GPU counters and gauges.
 *
 * Returns NO when sp->nvml.gpu_count is 0 or when a device handle cannot be
 * obtained for one of the GPU indices; returns YES otherwise.
 *
 * gpu_time, mem_time and energy are copied from the accumulators kept in
 * sp->nvml. mem_total, mem_free, ecc_errors and processes are summed over all
 * GPUs; temperature and fan_speed hold the maximum seen across all GPUs.
 * A per-GPU query that fails is skipped and contributes nothing.
 */
int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
  unsigned int i;

  if (sp->nvml.gpu_count == 0) {
    return NO;
  }

  // pick up latest value of accumulators
  nvml->gpu_time = sp->nvml.nvml_gpu_time;
  nvml->mem_time = sp->nvml.nvml_mem_time;
  nvml->energy = sp->nvml.nvml_energy;

  // and fill in the rest of the counters/gauges too
  nvml->device_count = sp->nvml.gpu_count;

  // zero these, and sum across all GPUs
  nvml->mem_total = 0;
  nvml->mem_free = 0;
  nvml->ecc_errors = 0;
  nvml->processes = 0;

  // use the max across all GPUs
  nvml->temperature = 0;
  nvml->fan_speed = 0;

  for (i = 0; i < sp->nvml.gpu_count; ++i) {
    unsigned long long eccErrors;
    unsigned int temp;
    nvmlDevice_t gpu;
    unsigned int speed;
    unsigned int procs;
    nvmlMemory_t memInfo;
    nvmlReturn_t result;

    if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
      return NO;
    }

    if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
      nvml->mem_total += memInfo.total;
      nvml->mem_free += memInfo.free;
    }

    // single-bit and double-bit volatile ECC errors both count towards the total
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
      if (nvml->temperature < temp) {
        nvml->temperature = temp;
      }
    }

    if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
      if (nvml->fan_speed < speed) {
        nvml->fan_speed = speed;
      }
    }

    // The count argument is in/out: on entry it is the capacity of the
    // (here absent) info array, so it must be 0 for a pure count query.
    // Leaving it uninitialised would hand NVML an arbitrary capacity together
    // with a NULL array.
    procs = 0;
    result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
    if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
      nvml->processes += procs;
    }
  }

  return YES;
}