// Build the set of device features static void get_device_features(struct device* dev) { if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature) == NVML_SUCCESS) { dev->feature_support |= TEMPERATURE; } if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) { dev->feature_support |= MEMORY_INFO; } if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) { dev->feature_support |= POWER_USAGE; } if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS, &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM, &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS && nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT, &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) { dev->feature_support |= CLOCK_INFO; } if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) { dev->feature_support |= FAN_INFO; } if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) { dev->feature_support |= UTILIZATION_INFO; } }
/*
 * Query the fan speed of an NVML device.
 *
 * dev: NVML device handle to query.
 *
 * Returns the value written by nvmlDeviceGetFanSpeed(), widened to
 * unsigned long long. If the NVML call fails, the failure is reported
 * through SUBDBG and the initial value 0 is returned.
 */
unsigned long long
getFanSpeed( nvmlDevice_t dev )
{
    unsigned int speed = 0;     /* stays 0 if the query fails */
    nvmlReturn_t status = nvmlDeviceGetFanSpeed( dev, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString( status ) );
    }

    return ( unsigned long long ) speed;
}
static void update_device_info(struct monitor* mon) { // TODO: NVML is thread safe, and the order we grab GPU information // here doesn't particularly matter, so might as well take advantage // of parallelism here. unsigned i; for(i = 0; i < mon->dev_count; ++i) { struct device* dev = &mon->devices[i]; if(dev->feature_support & MEMORY_INFO) { NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory)); } if(dev->feature_support & TEMPERATURE) { NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU, &dev->temperature)); } if(dev->feature_support & POWER_USAGE) { NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage)); } if(dev->feature_support & CLOCK_INFO) { for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT; ++type) { NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type])); } } if(dev->feature_support & FAN_INFO) { NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan)); } if(dev->event_set != NULL) { nvmlEventData_t data; NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1)); // TODO: Do something with the returned information. } } mon->last_update = time(NULL); }
/**
 * Fill *nvml with the current GPU counters and gauges.
 *
 * The accumulators (gpu_time, mem_time, energy) and the device count are
 * copied from sp->nvml. The remaining fields are recomputed by querying
 * every GPU index in [0, sp->nvml.gpu_count):
 *   - mem_total, mem_free, ecc_errors, processes: summed across GPUs
 *   - temperature, fan_speed: maximum across GPUs
 * A per-GPU query that fails simply contributes nothing to its field.
 *
 * Returns NO if there are no GPUs, or if a device handle cannot be obtained
 * (in that case *nvml has already been partially updated); YES otherwise.
 */
int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml)
{
  unsigned int i;

  if (sp->nvml.gpu_count == 0) {
    return NO;
  }

  // pick up latest value of accumulators
  nvml->gpu_time = sp->nvml.nvml_gpu_time;
  nvml->mem_time = sp->nvml.nvml_mem_time;
  nvml->energy = sp->nvml.nvml_energy;

  // and fill in the rest of the counters/gauges too
  nvml->device_count = sp->nvml.gpu_count;

  // zero these, and sum across all GPUs
  nvml->mem_total = 0;
  nvml->mem_free = 0;
  nvml->ecc_errors = 0;
  nvml->processes = 0;

  // use the max across all GPUs
  nvml->temperature = 0;
  nvml->fan_speed = 0;

  for (i = 0; i < sp->nvml.gpu_count; ++i) {
    unsigned long long eccErrors;
    unsigned int temp;
    nvmlDevice_t gpu;
    unsigned int speed;
    // NVML reads *infoCount as the capacity of the (here NULL) output
    // array, so it must be 0 on entry for a count-only query. Leaving it
    // uninitialised passed an indeterminate capacity alongside a NULL buffer.
    unsigned int procs = 0;
    nvmlMemory_t memInfo;
    nvmlReturn_t result;

    if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
      return NO;
    }

    if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
      nvml->mem_total += memInfo.total;
      nvml->mem_free += memInfo.free;
    }

    // volatile ECC errors: single-bit and double-bit counts are added together
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
      if (nvml->temperature < temp) {
        nvml->temperature = temp;
      }
    }

    if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
      if (nvml->fan_speed < speed) {
        nvml->fan_speed = speed;
      }
    }

    // With a zero-capacity query, NVML_ERROR_INSUFFICIENT_SIZE is the
    // expected result when processes are running; procs then holds the count.
    result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
    if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
      nvml->processes += procs;
    }
  }

  return YES;
}