Example #1
0
// Build the set of device features
static void get_device_features(struct device* dev)
{
  if(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                              &dev->temperature) == NVML_SUCCESS) {
    dev->feature_support |= TEMPERATURE;
  }

  if(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory) == NVML_SUCCESS) {
    dev->feature_support |= MEMORY_INFO;
  }

  if(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage) == NVML_SUCCESS) {
    dev->feature_support |= POWER_USAGE;
  }

  if(nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_GRAPHICS,
                            &dev->clock[NVML_CLOCK_GRAPHICS]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_SM,
                            &dev->clock[NVML_CLOCK_SM]) == NVML_SUCCESS &&

     nvmlDeviceGetClockInfo(dev->handle, NVML_CLOCK_COUNT,
                            &dev->clock[NVML_CLOCK_COUNT]) == NVML_SUCCESS) {

    dev->feature_support |= CLOCK_INFO;
  }

  if(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan) == NVML_SUCCESS) {
    dev->feature_support |= FAN_INFO;
  }

  if(nvmlDeviceGetUtilizationRates(dev->handle, &dev->util) == NVML_SUCCESS) {
    dev->feature_support |= UTILIZATION_INFO;
  }
}
Example #2
0
	void CMeasureNVML<TSkipMs, TVariant>::measure(void *pMsMeasurement, int32_t& rThreadNum) {
		nvmlReturn_t result;
		MS_MEASUREMENT_GPU *pMsMeasurementGpu = (MS_MEASUREMENT_GPU *) pMsMeasurement;
		
		result = nvmlDeviceGetPowerUsage(mDevice, &(pMsMeasurementGpu->nvml_power_cur));
		if (NVML_SUCCESS != result) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no power usage reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		
		if(TVariant == VARIANT_FULL) {
			nvmlMemory_t memory;
			if(!(mMeasureCounter++ % TSkipMs)) {
				result = nvmlDeviceGetMemoryInfo(mDevice, &memory);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				pMsMeasurementGpu->nvml_memory_free_cur = (uint32_t)(memory.free >> 10);
				pMsMeasurementGpu->nvml_memory_used_cur = (uint32_t)(memory.used >> 10);
				
				result = nvmlDeviceGetPerformanceState(mDevice, (nvmlPstates_t*)&(pMsMeasurementGpu->internal.nvml_power_state));
				if (NVML_SUCCESS != result) {
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					exit(EXIT_FAILURE);
				}
			
				nvmlTemperatureSensors_t sensorType = NVML_TEMPERATURE_GPU;
			
				result = nvmlDeviceGetTemperature(mDevice, sensorType, &(pMsMeasurementGpu->nvml_temperature_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read temperature. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_SM, &(pMsMeasurementGpu->nvml_clock_sm_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &(pMsMeasurementGpu->nvml_clock_mem_cur));
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread #" << rThreadNum << "): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
			}
Example #3
0
		unsigned long long
getPowerUsage( nvmlDevice_t dev )
{
		unsigned int power;
		nvmlReturn_t bad; 
		bad = nvmlDeviceGetPowerUsage( dev, &power );

		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
		}


		return (unsigned long long) power;
}
Example #4
0
static void update_device_info(struct monitor* mon)
{
  // TODO: NVML is thread safe, and the order we grab GPU information
  // here doesn't particularly matter, so might as well take advantage
  // of parallelism here.

  unsigned i;

  for(i = 0; i < mon->dev_count; ++i) {
    struct device* dev = &mon->devices[i];

    if(dev->feature_support & MEMORY_INFO) {
      NVML_TRY(nvmlDeviceGetMemoryInfo(dev->handle, &dev->memory));
    }

    if(dev->feature_support & TEMPERATURE) {
      NVML_TRY(nvmlDeviceGetTemperature(dev->handle, NVML_TEMPERATURE_GPU,
                                        &dev->temperature));
    }

    if(dev->feature_support & POWER_USAGE) {
      NVML_TRY(nvmlDeviceGetPowerUsage(dev->handle, &dev->power_usage));
    }

    if(dev->feature_support & CLOCK_INFO) {
      for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT;
          ++type) {
        NVML_TRY(nvmlDeviceGetClockInfo(dev->handle, type, &dev->clock[type]));
      }
    }

    if(dev->feature_support & FAN_INFO) {
      NVML_TRY(nvmlDeviceGetFanSpeed(dev->handle, &dev->fan));
    }

    if(dev->event_set != NULL) {
      nvmlEventData_t data;

      NVML_TRY(nvmlEventSetWait(dev->event_set, &data, 1));

      // TODO: Do something with the returned information.
    }
  }

  mon->last_update = time(NULL);
}
Example #5
0
  /*_________________---------------------------__________________
    _________________     nvml_tick             __________________
    -----------------___________________________------------------
    Called every second
  */
  void nvml_tick(HSP *sp) {
    if(sp->nvml.gpu_count > 0) {
      unsigned int i;

      for (i = 0; i < sp->nvml.gpu_count; ++i) {
        nvmlDevice_t gpu;
        unsigned int power_mW;
	nvmlUtilization_t util;

        if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
          continue;
        }
        if (NVML_SUCCESS == nvmlDeviceGetUtilizationRates(gpu, &util)) {
	  sp->nvml.nvml_gpu_time += util.gpu * 10; // accumulate as mS
	  sp->nvml.nvml_mem_time += util.memory * 10; // accumulate as mS
        }
        if (NVML_SUCCESS == nvmlDeviceGetPowerUsage(gpu, &power_mW)) {
	  sp->nvml.nvml_energy += power_mW; // accumulate as mJ
        }
      }

    }
  }
Example #6
0
		static int 
detectDevices( ) 
{
		nvmlReturn_t ret;
		nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
		nvmlDevice_t handle;
		nvmlPciInfo_t info;

		cudaError_t cuerr;

		char busId[16];
		char name[64];
		char inforomECC[16];
		char inforomPower[16];
		char names[device_count][64];
		char nvml_busIds[device_count][16];

		float ecc_version = 0.0, power_version = 0.0;

		int i = 0,
			j = 0;
		int isTesla = 0;
		int isFermi	= 0;
		int isUnique = 1;

		unsigned int temp = 0;


		/* list of nvml pci_busids */
	for (i=0; i < device_count; i++) {
		ret = nvmlDeviceGetHandleByIndex( i, &handle );	
		if ( NVML_SUCCESS != ret ) {
			SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i);
			return PAPI_ESYS;
		}

		ret = nvmlDeviceGetPciInfo( handle, &info );
		if ( NVML_SUCCESS != ret ) {
			SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", nvmlErrorString(ret) );
			return PAPI_ESYS;
		}

		strncpy(nvml_busIds[i], info.busId, 16);
		
	}

	/* We want to key our list of nvmlDevice_ts by each device's cuda index */
	for (i=0; i < device_count; i++) {
			cuerr = cudaDeviceGetPCIBusId( busId, 16, i );
			if ( CUDA_SUCCESS != cuerr ) {
				SUBDBG("cudaDeviceGetPCIBusId failed.\n");
				return PAPI_ESYS;
			}
			for (j=0; j < device_count; j++ ) {
					if ( !strncmp( busId, nvml_busIds[j], 16) ) {
							ret = nvmlDeviceGetHandleByIndex(j, &devices[i] );
							if ( NVML_SUCCESS != ret )
								SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i);
								return PAPI_ESYS;
							break;
					}
			}	
	}

		memset(names, 0x0, device_count*64);
		/* So for each card, check whats querable */
		for (i=0; i < device_count; i++ ) {
				isTesla=0;
				isFermi=1;
				isUnique = 1;
				features[i] = 0;

				ret = nvmlDeviceGetName( devices[i], name, 64 );
				if ( NVML_SUCCESS != ret) {
					SUBDBG("nvmlDeviceGetName failed \n");
					return PAPI_ESYS;
				}

				for (j=0; j < i; j++ ) 
						if ( 0 == strncmp( name, names[j], 64 ) ) {
								/* if we have a match, and IF everything is sane, 
								 * devices with the same name eg Tesla C2075 share features */
								isUnique = 0;
								features[i] = features[j];

						}

				if ( isUnique ) {
						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_ECC, inforomECC, 16);
						if ( NVML_SUCCESS != ret ) {
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								isFermi = 0;
						}
						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_POWER, inforomPower, 16);
						if ( NVML_SUCCESS != ret ) {
								/* This implies the card is older then Fermi */
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n");
								isFermi = 0;
						} 

						ecc_version = strtof(inforomECC, NULL );
						power_version = strtof( inforomPower, NULL);

						ret = nvmlDeviceGetName( devices[i], name, 64 );
						isTesla = ( NULL == strstr(name, "Tesla") ) ? 0:1;

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_CLOCK_INFO;
								num_events += 3;
						}

						/* 	For Tesla and Quadro products from Fermi and Kepler families. 
							requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
							requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
							requires ECC mode to be enabled. */
						if ( isFermi ) {
								ret = nvmlDeviceGetEccMode( devices[i], &mode, NULL );
								if ( NVML_FEATURE_ENABLED == mode) {
										if ( ecc_version >= 2.0 ) {
												features[i] |= FEATURE_ECC_LOCAL_ERRORS;
												num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
										} 
										if ( ecc_version >= 1.0 ) {
												features[i] |= FEATURE_ECC_TOTAL_ERRORS;
												num_events += 2; /* single bit errors, double bit errors */
										}
								}	
						}

						/* For all discrete products with dedicated fans */
						features[i] |= FEATURE_FAN_SPEED;
						num_events++;

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_MAX_CLOCK;
								num_events += 3;
						}

						/* For all products */
						features[i] |= FEATURE_MEMORY_INFO;
						num_events += 3; /* total, free, used */

						/* For Tesla and Quadro products from the Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_PERF_STATES;
								num_events++;
						}

						/* 	For "GF11x" Tesla and Quadro products from the Fermi family
							requires NVML_INFOROM_POWER 3.0 or higher
							For Tesla and Quadro products from the Kepler family
							does not require NVML_INFOROM_POWER */
						if ( isFermi ) {
								ret = nvmlDeviceGetPowerUsage( devices[i], &temp);
								if ( NVML_SUCCESS == ret ) {
										features[i] |= FEATURE_POWER;
										num_events++;
								}
						}

						/* For all discrete and S-class products. */
						features[i] |= FEATURE_TEMP;
						num_events++;

						/* For Tesla and Quadro products from the Fermi and Kepler families */
						if (isFermi) {
								features[i] |= FEATURE_UTILIZATION;
								num_events += 2;
						}

						strncpy( names[i], name, 64); 

				}
		}
		return PAPI_OK;

}