/*
 * Fetch the total volatile ECC error count of the given bit type for one
 * device.
 *
 * dev  - NVML device handle to query.
 * bits - which ECC bit type to count.
 *
 * A failed NVML query is only reported through SUBDBG; it is not propagated
 * to the caller. The value returned is the counter as left by the NVML call,
 * which is set to 0 before the query is issued.
 */
unsigned long long getTotalEccErrors(nvmlDevice_t dev, nvmlEccBitType_t bits)
{
    unsigned long long total = 0;
    nvmlReturn_t status = nvmlDeviceGetTotalEccErrors(dev, bits, NVML_VOLATILE_ECC, &total);

    if (status != NVML_SUCCESS) {
        SUBDBG("something went wrong %s\n", nvmlErrorString(status));
    }

    return total;
}
/*
 * Populate an SFLHost_gpu_nvml sample from the NVML state held in sp.
 *
 * sp   - agent state; sp->nvml supplies the GPU count and the accumulated
 *        gpu_time / mem_time / energy values.
 * nvml - output structure to fill in.
 *
 * Memory totals, ECC errors and process counts are summed across all GPUs;
 * temperature and fan speed report the maximum seen on any GPU. A per-GPU
 * query that fails is skipped, leaving that GPU out of the corresponding
 * sum/max.
 *
 * Returns NO when there are no GPUs or when a device handle cannot be
 * obtained (in the latter case *nvml may already be partially filled in);
 * returns YES otherwise.
 */
int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
  unsigned int i;

  if (sp->nvml.gpu_count == 0) {
    return NO;
  }

  // pick up latest value of accumulators
  nvml->gpu_time = sp->nvml.nvml_gpu_time;
  nvml->mem_time = sp->nvml.nvml_mem_time;
  nvml->energy = sp->nvml.nvml_energy;

  // and fill in the rest of the counters/gauges too
  nvml->device_count = sp->nvml.gpu_count;

  // zero these, and sum across all GPUs
  nvml->mem_total = 0;
  nvml->mem_free = 0;
  nvml->ecc_errors = 0;
  nvml->processes = 0;

  // use the max across all GPUs
  nvml->temperature = 0;
  nvml->fan_speed = 0;

  for (i = 0; i < sp->nvml.gpu_count; ++i) {
    unsigned long long eccErrors;
    unsigned int temp;
    nvmlDevice_t gpu;
    unsigned int speed;
    // Must be 0 on entry: the count argument is in/out (array size in,
    // element count out), and we pass a NULL array to query the count only.
    unsigned int procs = 0;
    nvmlMemory_t memInfo;
    nvmlReturn_t result;

    if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
      return NO;
    }

    if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
      nvml->mem_total += memInfo.total;
      nvml->mem_free += memInfo.free;
    }

    // single-bit and double-bit volatile ECC errors go into one total
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
      if (nvml->temperature < temp) {
        nvml->temperature = temp;
      }
    }

    if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
      if (nvml->fan_speed < speed) {
        nvml->fan_speed = speed;
      }
    }

    // With a zero-sized (NULL) array NVML reports INSUFFICIENT_SIZE whenever
    // processes exist and still writes the required count into procs, so
    // that result is treated the same as success here.
    result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
    if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
      nvml->processes += procs;
    }
  }

  return YES;
}