Esempio n. 1
0
// Build the set of device features
static void get_device_features(struct device* dev)
{
  if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                              &dev->temperature) == NVML_SUCCESS) {
    dev->feature_support |= TEMPERATURE;
  }

  if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) {
    dev->feature_support |= MEMORY_INFO;
  }

  if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) {
    dev->feature_support |= POWER_USAGE;
  }

  if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS,
                            &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM,
                            &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT,
                            &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) {

    dev->feature_support |= CLOCK_INFO;
  }

  if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) {
    dev->feature_support |= FAN_INFO;
  }

  if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) {
    dev->feature_support |= UTILIZATION_INFO;
  }
}
Esempio n. 2
0
		unsigned long long 
getFanSpeed( nvmlDevice_t dev ) 
{
		unsigned int ret = 0;
		nvmlReturn_t bad; 
		bad = nvmlDeviceGetFanSpeed( dev, &ret );

		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
		}


		return (unsigned long long)ret; 
}
Esempio n. 3
0
static void update_device_info(struct monitor* mon)
{
  // TODO: NVML is thread safe, and the order we grab GPU information
  // here doesn't particularly matter, so might as well take advantage
  // of parallelism here.

  unsigned i;

  for(i = 0; i < mon->dev_count; ++i) {
    struct device* dev = &mon->devices[i];

    if(dev->feature_support & MEMORY_INFO) {
      NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory));
    }

    if(dev->feature_support & TEMPERATURE) {
      NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                                        &dev->temperature));
    }

    if(dev->feature_support & POWER_USAGE) {
      NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage));
    }

    if(dev->feature_support & CLOCK_INFO) {
      for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT;
          ++type) {
        NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type]));
      }
    }

    if(dev->feature_support & FAN_INFO) {
      NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan));
    }

    if(dev->event_set != NULL) {
      nvmlEventData_t data;

      NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1));

      // TODO: Do something with the returned information.
    }
  }

  mon->last_update = time(NULL);
}
Esempio n. 4
0
  int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
    unsigned int i;

    if(sp->nvml.gpu_count == 0) {
      return NO;
    }

    // pick up latest value of accumulators
    nvml->gpu_time = sp->nvml.nvml_gpu_time;
    nvml->mem_time = sp->nvml.nvml_mem_time;
    nvml->energy = sp->nvml.nvml_energy;

    // and fill in the rest of the counters/gauges too
    nvml->device_count = sp->nvml.gpu_count;

    // zero these, and sum across all GPUs
    nvml->mem_total = 0;
    nvml->mem_free = 0;
    nvml->ecc_errors = 0;
    nvml->processes = 0;

    // use the max across all GPUs
    nvml->temperature = 0;
    nvml->fan_speed = 0;

    for (i = 0; i < sp->nvml.gpu_count; ++i) {
      unsigned long long eccErrors;
      unsigned int temp;
      nvmlDevice_t gpu;
      unsigned int speed;
      unsigned int procs;
      nvmlMemory_t memInfo;
      nvmlReturn_t result;

      if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
        return NO;
      }
      if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
        nvml->mem_total += memInfo.total;
        nvml->mem_free  += memInfo.free;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
        if (nvml->temperature < temp) {
          nvml->temperature = temp;
        }
      }
      if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
        if (nvml->fan_speed < speed) {
          nvml->fan_speed = speed;
        }
      }
      result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
      if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
        nvml->processes += procs;
      }
    }

    return YES;
  }