コード例 #1
0
  /*
   * Populate an SFLHost_gpu_nvml counter block from the cached accumulators
   * in sp->nvml plus a fresh NVML query of every GPU.
   *
   *  - gpu_time, mem_time and energy are copied from the accumulators
   *    held in sp->nvml.
   *  - mem_total, mem_free, ecc_errors and processes are summed across
   *    all GPUs (ecc_errors adds the single-bit and double-bit volatile
   *    counts together).
   *  - temperature and fan_speed report the maximum seen on any GPU.
   *
   * A per-device query that fails is skipped and contributes nothing to
   * the totals.
   *
   * Returns NO when no GPUs are known (nvml is then left untouched) or when
   * a device handle cannot be obtained (nvml is then only partially
   * filled in); returns YES otherwise.
   */
  int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
    unsigned int i;

    if(sp->nvml.gpu_count == 0) {
      return NO;
    }

    // pick up latest value of accumulators
    nvml->gpu_time = sp->nvml.nvml_gpu_time;
    nvml->mem_time = sp->nvml.nvml_mem_time;
    nvml->energy = sp->nvml.nvml_energy;

    // and fill in the rest of the counters/gauges too
    nvml->device_count = sp->nvml.gpu_count;

    // zero these, and sum across all GPUs
    nvml->mem_total = 0;
    nvml->mem_free = 0;
    nvml->ecc_errors = 0;
    nvml->processes = 0;

    // use the max across all GPUs
    nvml->temperature = 0;
    nvml->fan_speed = 0;

    for (i = 0; i < sp->nvml.gpu_count; ++i) {
      unsigned long long eccErrors;
      unsigned int temp;
      nvmlDevice_t gpu;
      unsigned int speed;
      unsigned int procs;
      nvmlMemory_t memInfo;
      nvmlReturn_t result;

      if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
        return NO;
      }
      if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
        nvml->mem_total += memInfo.total;
        nvml->mem_free  += memInfo.free;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
        if (nvml->temperature < temp) {
          nvml->temperature = temp;
        }
      }
      if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
        if (nvml->fan_speed < speed) {
          nvml->fan_speed = speed;
        }
      }
      // The count argument is in/out: NVML reads it as the capacity of the
      // (here NULL) info array, so it must be 0 to request a count-only
      // query. NVML then answers SUCCESS (nothing running) or
      // INSUFFICIENT_SIZE with the process count written back.
      procs = 0;
      result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
      if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
        nvml->processes += procs;
      }
    }

    return YES;
  }
コード例 #2
0
ファイル: relnv.c プロジェクト: yyd01245/avencoder_nvidia
/*
 * Report how many compute processes are running on each NVIDIA GPU.
 *
 * ncores   - out: receives the number of devices reported by NVML (written
 *            as soon as the device count has been read, even if a later
 *            step fails).
 * valarray - out: valarray[i] receives the number of compute processes on
 *            device i; the caller must provide room for at least *ncores
 *            entries (at most NDEV).
 *
 * NVML is initialised on entry and shut down again on every path once
 * initialisation has succeeded.
 *
 * Returns 0 on success and -1 on any failure; a diagnostic is written to
 * stderr for each failure. A device running more than MAXPROC compute
 * processes is reported as a failure, because NVML rejects the fixed-size
 * process buffer.
 */
static int get_process_info(unsigned int*ncores,unsigned int *valarray)
{
    nvmlProcessInfo_t pis[MAXPROC];
    nvmlReturn_t ret;
    unsigned int c = 0;
    unsigned int i;
    int rc = -1;

    ret=nvmlInit();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        goto out_shutdown;
    }

    *ncores=c;

    /* This routine is built for at most NDEV devices; refuse larger systems
       instead of writing past fixed-size per-device storage. */
    if(c>(unsigned int)NDEV) {
        fprintf(stderr,"ERROR:: Current number of Cores is [%u],more than %u....YOU NEED RECOMPILE THIS ROUTINE\n",c,(unsigned int)NDEV);
        goto out_shutdown;
    }

    for(i=0; i<c; i++) {
        nvmlDevice_t dev;
        unsigned int np=MAXPROC; /* in: capacity of pis, out: processes found */

        ret=nvmlDeviceGetHandleByIndex(i,&dev);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            goto out_shutdown;
        }

        ret=nvmlDeviceGetComputeRunningProcesses(dev,&np,pis);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret));
            goto out_shutdown;
        }
        valarray[i]=np;
    }

    rc=0;

out_shutdown:
    /* Always balance the successful nvmlInit(), including on error paths. */
    ret=nvmlShutdown();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        rc=-1;
    }

    return rc;
}