static int get_mem_info(unsigned int*ncores,unsigned int*usedarray) { nvmlReturn_t ret; ret=nvmlInit(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int c; ret=nvmlDeviceGetCount(&c); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret)); return -1; } *ncores=c; nvmlDevice_t devs[NDEV]; nvmlMemory_t meminfo; int i; for(i=0; i<c; i++) { ret=nvmlDeviceGetHandleByIndex(i,&devs[i]); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret)); return -1; } ret=nvmlDeviceGetMemoryInfo(devs[i],&meminfo); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret)); return -1; } usedarray[i]=meminfo.used; } ret=nvmlShutdown(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret)); return -1; } return 0; }
/*
 * Read one location-specific volatile ECC error counter of a device.
 *
 * dev       - NVML device handle to query.
 * bits      - ECC bit type forwarded to nvmlDeviceGetDetailedEccErrors().
 * which_one - LOCAL_ECC_REGFILE, LOCAL_ECC_L1, LOCAL_ECC_L2 or LOCAL_ECC_MEM.
 *
 * Returns the selected counter, or (unsigned long long)-1 when which_one is
 * not one of the selectors above. The counters start out zeroed, so a failed
 * query (reported through SUBDBG) yields a defined value instead of reading
 * indeterminate stack contents.
 */
unsigned long long getEccLocalErrors( nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
{
    nvmlEccErrorCounts_t counts = { 0 };
    nvmlReturn_t bad;

    bad = nvmlDeviceGetDetailedEccErrors( dev, bits, NVML_VOLATILE_ECC, &counts);
    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
    }

    switch ( which_one ) {
    case LOCAL_ECC_REGFILE:
        return counts.registerFile;
    case LOCAL_ECC_L1:
        return counts.l1Cache;
    case LOCAL_ECC_L2:
        return counts.l2Cache;
    case LOCAL_ECC_MEM:
        return counts.deviceMemory;
    default:
        break;
    }
    return (unsigned long long)-1;
}
/*
 * Abort the process when an NVML call did not succeed.
 *
 * nvret - status returned by the NVML call being checked.
 * msg   - caller-supplied text included in the diagnostic.
 * line  - line number included in the diagnostic.
 *
 * Does nothing when nvret equals NVML_SUCCESS; otherwise writes one line to
 * stderr and terminates with exit status 1.
 */
static void check_nvml_error(nvmlReturn_t nvret, const char *msg, int line)
{
    if (nvret == NVML_SUCCESS) {
        return;
    }

    fprintf(stderr, "NVML:@%d::%s:{%s}\n", line, msg, nvmlErrorString(nvret));
    exit(1);
}
/*
 * Translate the device's NVML performance state into its numeric index.
 *
 * dev - NVML device handle to query.
 *
 * Returns 0 for NVML_PSTATE_0 up to 15 for NVML_PSTATE_15, and
 * (unsigned long long)-1 for NVML_PSTATE_UNKNOWN or any other value.
 * The state is preset to NVML_PSTATE_15, so a failed query that leaves it
 * untouched is reported (via SUBDBG) and results in 15.
 */
unsigned long long getPState( nvmlDevice_t dev )
{
    nvmlPstates_t pstate = NVML_PSTATE_15;
    unsigned int level;
    nvmlReturn_t status;

    status = nvmlDeviceGetPerformanceState( dev, &pstate );
    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    switch ( pstate ) {
    case NVML_PSTATE_0:  level = 0;  break;
    case NVML_PSTATE_1:  level = 1;  break;
    case NVML_PSTATE_2:  level = 2;  break;
    case NVML_PSTATE_3:  level = 3;  break;
    case NVML_PSTATE_4:  level = 4;  break;
    case NVML_PSTATE_5:  level = 5;  break;
    case NVML_PSTATE_6:  level = 6;  break;
    case NVML_PSTATE_7:  level = 7;  break;
    case NVML_PSTATE_8:  level = 8;  break;
    case NVML_PSTATE_9:  level = 9;  break;
    case NVML_PSTATE_10: level = 10; break;
    case NVML_PSTATE_11: level = 11; break;
    case NVML_PSTATE_12: level = 12; break;
    case NVML_PSTATE_13: level = 13; break;
    case NVML_PSTATE_14: level = 14; break;
    case NVML_PSTATE_15: level = 15; break;
    case NVML_PSTATE_UNKNOWN:
    default:
        /* No numeric index exists for an unknown state. */
        return (unsigned long long) -1;
    }

    return (unsigned long long) level;
}
/*
 * Query one clock of a device through nvmlDeviceGetClockInfo().
 *
 * dev       - NVML device handle to query.
 * which_one - clock type forwarded unchanged to NVML.
 *
 * Returns the value NVML stored, widened to unsigned long long. The value is
 * preset to 0; a failed query is only reported through SUBDBG.
 */
unsigned long long getClockSpeed( nvmlDevice_t dev, nvmlClockType_t which_one )
{
    unsigned int speed = 0;
    nvmlReturn_t status = nvmlDeviceGetClockInfo( dev, which_one, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    return (unsigned long long) speed;
}
/*
 * Query the total volatile ECC error count of a device.
 *
 * dev  - NVML device handle to query.
 * bits - ECC bit type forwarded to nvmlDeviceGetTotalEccErrors().
 *
 * Returns the count NVML stored. The count is preset to 0; a failed query is
 * only reported through SUBDBG.
 */
unsigned long long getTotalEccErrors( nvmlDevice_t dev, nvmlEccBitType_t bits)
{
    unsigned long long total = 0;
    nvmlReturn_t status = nvmlDeviceGetTotalEccErrors( dev, bits, NVML_VOLATILE_ECC, &total );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    return total;
}
/*
 * Query the NVML_TEMPERATURE_GPU sensor of a device.
 *
 * dev - NVML device handle to query.
 *
 * Returns the reading NVML stored, widened to unsigned long long. The reading
 * is preset to 0; a failed query is only reported through SUBDBG.
 */
unsigned long long getTemperature( nvmlDevice_t dev )
{
    unsigned int reading = 0;
    nvmlReturn_t status = nvmlDeviceGetTemperature( dev, NVML_TEMPERATURE_GPU, &reading );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    return (unsigned long long) reading;
}
/*
 * Query the power usage of a device through nvmlDeviceGetPowerUsage().
 *
 * dev - NVML device handle to query.
 *
 * Returns the value NVML stored, widened to unsigned long long. The value is
 * preset to 0 so that a failed query (reported through SUBDBG) returns a
 * defined result rather than an indeterminate one.
 */
unsigned long long getPowerUsage( nvmlDevice_t dev )
{
    unsigned int power = 0;
    nvmlReturn_t bad;

    bad = nvmlDeviceGetPowerUsage( dev, &power );
    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
    }

    return (unsigned long long) power;
}
/*
 * Query the fan speed of a device through nvmlDeviceGetFanSpeed().
 *
 * dev - NVML device handle to query.
 *
 * Returns the value NVML stored, widened to unsigned long long. The value is
 * preset to 0; a failed query is only reported through SUBDBG.
 */
unsigned long long getFanSpeed( nvmlDevice_t dev )
{
    unsigned int speed = 0;
    nvmlReturn_t status = nvmlDeviceGetFanSpeed( dev, &speed );

    if ( status != NVML_SUCCESS ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
    }

    return (unsigned long long) speed;
}
// NVIDIA NVML library function wrapper for GPU DVFS. int SetGPUFreq(unsigned int clock_mem, unsigned int clock_core) { nvmlDevice_t device;//int device; nvmlReturn_t result; result = nvmlInit(); result = nvmlDeviceGetHandleByIndex(0, &device);//cudaGetDevice(&device); result = nvmlDeviceSetApplicationsClocks(device, clock_mem, clock_core);//(nvmlDevice_t)device if(result != NVML_SUCCESS) { printf("Failed to set GPU core and memory frequencies: %s\n", nvmlErrorString(result)); return 1; } else { nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &clock_core); nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_MEM, &clock_mem); ////printf("GPU core frequency is now set to %d MHz; GPU memory frequency is now set to %d MHz", clock_core, clock_mem); return 0; } }
/*
 * Query one utilization rate of a device.
 *
 * dev       - NVML device handle to query.
 * which_one - GPU_UTILIZATION selects util.gpu, MEMORY_UTILIZATION selects
 *             util.memory (the original note maps these to 0 and 1;
 *             presumably the constants' values - confirm).
 *
 * Returns the selected rate, or (unsigned long long)-1 for any other
 * selector. The rates start out zeroed, so a failed query (reported through
 * SUBDBG) yields a defined value instead of indeterminate stack contents.
 */
unsigned long long getUtilization( nvmlDevice_t dev, int which_one )
{
    nvmlUtilization_t util = { 0 };
    nvmlReturn_t bad;

    bad = nvmlDeviceGetUtilizationRates( dev, &util );
    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
    }

    switch (which_one) {
    case GPU_UTILIZATION:
        return (unsigned long long) util.gpu;
    case MEMORY_UTILIZATION:
        return (unsigned long long) util.memory;
    default:
        break;
    }
    return (unsigned long long) -1;
}
/*
 * Query one memory figure of a device through nvmlDeviceGetMemoryInfo().
 *
 * dev       - NVML device handle to query.
 * which_one - MEMINFO_TOTAL_MEMORY selects meminfo.total, MEMINFO_UNALLOCED
 *             selects meminfo.free, MEMINFO_ALLOCED selects meminfo.used.
 *
 * Returns the selected figure, or (unsigned long long)-1 for any other
 * selector. The figures start out zeroed, so a failed query (reported through
 * SUBDBG) yields a defined value instead of indeterminate stack contents.
 */
unsigned long long getMemoryInfo( nvmlDevice_t dev, int which_one )
{
    nvmlMemory_t meminfo = { 0 };
    nvmlReturn_t bad;

    bad = nvmlDeviceGetMemoryInfo( dev, &meminfo );
    if ( NVML_SUCCESS != bad ) {
        SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
    }

    switch (which_one) {
    case MEMINFO_TOTAL_MEMORY:
        return meminfo.total;
    case MEMINFO_UNALLOCED:
        return meminfo.free;
    case MEMINFO_ALLOCED:
        return meminfo.used;
    default:
        break;
    }
    return (unsigned long long)-1;
}
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    initnvml
 * Signature: ()Ljava/lang/String;
 *
 * Initializes NVML, looks up device index 0 and returns the Java string
 * "Device : <name>\n". On success NVML stays initialized.
 *
 * On any NVML failure the problem is printed to stdout and a null reference
 * is returned; NVML is shut down again if it had been initialized.
 */
JNIEXPORT jstring JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_initnvml
  (JNIEnv *env, jobject)
{
    const unsigned int device_index = 0;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlReturn_t result;

    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 0; /* nothing to shut down; no further NVML call is valid */
    }

    /* `device` is not declared in this function - presumably a file-scope
     * handle shared with the other native methods; confirm. */
    result = nvmlDeviceGetHandleByIndex(device_index, &device);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get handle for device %u: %s\n", device_index, nvmlErrorString(result));
        nvmlShutdown();
        return 0;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get name of device %u: %s\n", device_index, nvmlErrorString(result));
        nvmlShutdown();
        return 0;
    }

    printf("Device : %s\n", name);

    /* Build the reply with std::string so no fixed-size scratch buffer is needed. */
    std::string reply("Device : ");
    reply.append(name);
    reply.append("\n");
    return env->NewStringUTF( reply.c_str() );
}
static int detectDevices( ) { nvmlReturn_t ret; nvmlEnableState_t mode = NVML_FEATURE_DISABLED; nvmlDevice_t handle; nvmlPciInfo_t info; cudaError_t cuerr; char busId[16]; char name[64]; char inforomECC[16]; char inforomPower[16]; char names[device_count][64]; char nvml_busIds[device_count][16]; float ecc_version = 0.0, power_version = 0.0; int i = 0, j = 0; int isTesla = 0; int isFermi = 0; int isUnique = 1; unsigned int temp = 0; /* list of nvml pci_busids */ for (i=0; i < device_count; i++) { ret = nvmlDeviceGetHandleByIndex( i, &handle ); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i); return PAPI_ESYS; } ret = nvmlDeviceGetPciInfo( handle, &info ); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", nvmlErrorString(ret) ); return PAPI_ESYS; } strncpy(nvml_busIds[i], info.busId, 16); } /* We want to key our list of nvmlDevice_ts by each device's cuda index */ for (i=0; i < device_count; i++) { cuerr = cudaDeviceGetPCIBusId( busId, 16, i ); if ( CUDA_SUCCESS != cuerr ) { SUBDBG("cudaDeviceGetPCIBusId failed.\n"); return PAPI_ESYS; } for (j=0; j < device_count; j++ ) { if ( !strncmp( busId, nvml_busIds[j], 16) ) { ret = nvmlDeviceGetHandleByIndex(j, &devices[i] ); if ( NVML_SUCCESS != ret ) SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i); return PAPI_ESYS; break; } } } memset(names, 0x0, device_count*64); /* So for each card, check whats querable */ for (i=0; i < device_count; i++ ) { isTesla=0; isFermi=1; isUnique = 1; features[i] = 0; ret = nvmlDeviceGetName( devices[i], name, 64 ); if ( NVML_SUCCESS != ret) { SUBDBG("nvmlDeviceGetName failed \n"); return PAPI_ESYS; } for (j=0; j < i; j++ ) if ( 0 == strncmp( name, names[j], 64 ) ) { /* if we have a match, and IF everything is sane, * devices with the same name eg Tesla C2075 share features */ isUnique = 0; features[i] = features[j]; } if ( isUnique ) { ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_ECC, inforomECC, 
16); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) ); isFermi = 0; } ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_POWER, inforomPower, 16); if ( NVML_SUCCESS != ret ) { /* This implies the card is older then Fermi */ SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) ); SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n"); isFermi = 0; } ecc_version = strtof(inforomECC, NULL ); power_version = strtof( inforomPower, NULL); ret = nvmlDeviceGetName( devices[i], name, 64 ); isTesla = ( NULL == strstr(name, "Tesla") ) ? 0:1; /* For Tesla and Quadro products from Fermi and Kepler families. */ if ( isFermi ) { features[i] |= FEATURE_CLOCK_INFO; num_events += 3; } /* For Tesla and Quadro products from Fermi and Kepler families. requires NVML_INFOROM_ECC 2.0 or higher for location-based counts requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts requires ECC mode to be enabled. */ if ( isFermi ) { ret = nvmlDeviceGetEccMode( devices[i], &mode, NULL ); if ( NVML_FEATURE_ENABLED == mode) { if ( ecc_version >= 2.0 ) { features[i] |= FEATURE_ECC_LOCAL_ERRORS; num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */ } if ( ecc_version >= 1.0 ) { features[i] |= FEATURE_ECC_TOTAL_ERRORS; num_events += 2; /* single bit errors, double bit errors */ } } } /* For all discrete products with dedicated fans */ features[i] |= FEATURE_FAN_SPEED; num_events++; /* For Tesla and Quadro products from Fermi and Kepler families. */ if ( isFermi ) { features[i] |= FEATURE_MAX_CLOCK; num_events += 3; } /* For all products */ features[i] |= FEATURE_MEMORY_INFO; num_events += 3; /* total, free, used */ /* For Tesla and Quadro products from the Fermi and Kepler families. 
*/ if ( isFermi ) { features[i] |= FEATURE_PERF_STATES; num_events++; } /* For "GF11x" Tesla and Quadro products from the Fermi family requires NVML_INFOROM_POWER 3.0 or higher For Tesla and Quadro products from the Kepler family does not require NVML_INFOROM_POWER */ if ( isFermi ) { ret = nvmlDeviceGetPowerUsage( devices[i], &temp); if ( NVML_SUCCESS == ret ) { features[i] |= FEATURE_POWER; num_events++; } } /* For all discrete and S-class products. */ features[i] |= FEATURE_TEMP; num_events++; /* For Tesla and Quadro products from the Fermi and Kepler families */ if (isFermi) { features[i] |= FEATURE_UTILIZATION; num_events += 2; } strncpy( names[i], name, 64); } } return PAPI_OK; }
static int get_process_info(unsigned int*ncores,unsigned int *valarray) { nvmlReturn_t ret; ret=nvmlInit(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int c; ret=nvmlDeviceGetCount(&c); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret)); return -1; } *ncores=c; /* if(c!=NDEV){ fprintf(stderr,"ERROR:: Current number of Cores is [%d],not %d....YOU NEED RECOMPILE THIS ROUTINE\n",c,NDEV); return -2; } */ nvmlDevice_t devs[NDEV]; nvmlProcessInfo_t pis[MAXPROC]; int i; for(i=0; i<c; i++) { ret=nvmlDeviceGetHandleByIndex(i,&devs[i]); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int np=MAXPROC; ret=nvmlDeviceGetComputeRunningProcesses(devs[i],&np,pis); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret)); return -1; } valarray[i]=np; } ret=nvmlShutdown(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret)); return -1; } return 0; }
int main() { nvmlReturn_t result; unsigned int device_count, i; // First initialize NVML library result = nvmlInit(); if (NVML_SUCCESS != result) { printf("Failed to initialize NVML: %s\n", nvmlErrorString(result)); printf("Press ENTER to continue...\n"); getchar(); return 1; } result = nvmlDeviceGetCount(&device_count); if (NVML_SUCCESS != result) { printf("Failed to query device count: %s\n", nvmlErrorString(result)); goto Error; } printf("Found %d device%s\n\n", device_count, device_count != 1 ? "s" : ""); printf("Listing devices:\n"); for (i = 0; i < device_count; i++) { nvmlDevice_t device; char name[NVML_DEVICE_NAME_BUFFER_SIZE]; nvmlPciInfo_t pci; nvmlComputeMode_t compute_mode; // Query for device handle to perform operations on a device // You can also query device handle by other features like: // nvmlDeviceGetHandleBySerial // nvmlDeviceGetHandleByPciBusId result = nvmlDeviceGetHandleByIndex(i, &device); if (NVML_SUCCESS != result) { printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); if (NVML_SUCCESS != result) { printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result)); goto Error; } // pci.busId is very useful to know which device physically you're talking to // Using PCI identifier you can also match nvmlDevice handle to CUDA device. result = nvmlDeviceGetPciInfo(device, &pci); if (NVML_SUCCESS != result) { printf("Failed to get pci info for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } printf("%d. 
%s [%s]\n", i, name, pci.busId); // This is a simple example on how you can modify GPU's state result = nvmlDeviceGetComputeMode(device, &compute_mode); if (NVML_ERROR_NOT_SUPPORTED == result) printf("\t This is not CUDA capable device\n"); else if (NVML_SUCCESS != result) { printf("Failed to get compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } else { // try to change compute mode printf("\t Changing device's compute mode from '%s' to '%s'\n", convertToComputeModeString(compute_mode), convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED)); result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED); if (NVML_ERROR_NO_PERMISSION == result) printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result)); else if (NVML_ERROR_NOT_SUPPORTED == result) printf("\t\t Compute mode prohibited not supported. You might be running on\n" "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n"); else if (NVML_SUCCESS != result) { printf("\t\t Failed to set compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } else { printf("\t Restoring device's compute mode back to '%s'\n", convertToComputeModeString(compute_mode)); result = nvmlDeviceSetComputeMode(device, compute_mode); if (NVML_SUCCESS != result) { printf("\t\t Failed to restore compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } } } } result = nvmlShutdown(); if (NVML_SUCCESS != result) printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); printf("All done.\n"); printf("Press ENTER to continue...\n"); getchar(); return 0; Error: result = nvmlShutdown(); if (NVML_SUCCESS != result) printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); printf("Press ENTER to continue...\n"); getchar(); return 1; }