/*
 * Take a snapshot of per-GPU statistics through NVML.
 *
 * Initialises NVML, allocates one zero-filled devstat per device reported by
 * nvmlDeviceGetCount() and, for every device, stores its handle, memory
 * info, utilisation rates and encoder/decoder utilisation.  NVML is shut
 * down again before returning.
 *
 * stats  out: receives the calloc()ed array; the caller is responsible for
 *        free()ing it.
 *
 * Returns the number of devices, or -1 when the array cannot be allocated.
 * Failures of init/count/shutdown are passed to CHK_NVML (defined
 * elsewhere; its behaviour is not visible from this function).
 *
 * Per-device queries are best effort: a device whose handle cannot be
 * obtained keeps its zero-filled entry, and a failing individual query
 * leaves the corresponding field zeroed.
 */
int probe_gpustats(devstat **stats)
{
    unsigned int n_dev;
    nvmlReturn_t nvret;

    nvret = nvmlInit();
    CHK_NVML(nvret, "Init NVML");
    nvret = nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret, "getCount");

    *stats = (devstat *)calloc(n_dev, sizeof(devstat));
    if (*stats == NULL && n_dev > 0) {
        /* Out of memory: release NVML and report instead of dereferencing NULL. */
        nvmlShutdown();
        return -1;
    }
    devstat *pstats = *stats;

    for (unsigned int i = 0; i < n_dev; i++) {
        /* Sampling period reported by the encoder/decoder queries; not kept. */
        unsigned int sampling_period;

        if (nvmlDeviceGetHandleByIndex(i, &pstats[i].handler) != NVML_SUCCESS)
            continue; /* no usable handle: leave this entry zeroed */

        (void)nvmlDeviceGetMemoryInfo(pstats[i].handler, &pstats[i].meminfo);
        (void)nvmlDeviceGetUtilizationRates(pstats[i].handler, &pstats[i].utils);
        (void)nvmlDeviceGetEncoderUtilization(pstats[i].handler,
                                              &pstats[i].encutil, &sampling_period);
        (void)nvmlDeviceGetDecoderUtilization(pstats[i].handler,
                                              &pstats[i].decutil, &sampling_period);
    }

    nvret = nvmlShutdown();
    CHK_NVML(nvret, "Shutdown NVML");
    return n_dev;
}
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray) { nvmlReturn_t ret; ret=nvmlInit(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int c; ret=nvmlDeviceGetCount(&c); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret)); return -1; } *ncores=c; nvmlDevice_t devs[NDEV]; nvmlMemory_t meminfo; int i; for(i=0; i<c; i++) { ret=nvmlDeviceGetHandleByIndex(i,&devs[i]); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret)); return -1; } ret=nvmlDeviceGetMemoryInfo(devs[i],&meminfo); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret)); return -1; } usedarray[i]=meminfo.used; } ret=nvmlShutdown(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret)); return -1; } return 0; }
static void init_device_info(struct monitor* mon) { gethostname(mon->hostname, 64); NVML_TRY(nvmlSystemGetDriverVersion(mon->driver_version, sizeof(mon->driver_version))); NVML_TRY(nvmlSystemGetNVMLVersion(mon->nvml_version, sizeof(mon->nvml_version))); NVML_TRY(nvmlDeviceGetCount(&mon->dev_count)); mon->devices = calloc(mon->dev_count, sizeof(struct device)); for(unsigned i = 0; i < mon->dev_count; ++i) { struct device dev; memset(&dev, 0, sizeof(struct device)); dev.index = i; NVML_TRY(nvmlDeviceGetHandleByIndex(i, &dev.handle)); NVML_TRY(nvmlDeviceGetName(dev.handle, dev.name, sizeof(dev.name))); NVML_TRY(nvmlDeviceGetSerial(dev.handle, dev.serial, sizeof(dev.serial))); NVML_TRY(nvmlDeviceGetUUID(dev.handle, dev.uuid, sizeof(dev.uuid))); NVML_TRY(nvmlDeviceGetPciInfo(dev.handle, &dev.pci)); NVML_TRY(nvmlDeviceGetMemoryInfo(dev.handle, &dev.memory)); unsigned long long event_types; NVML_TRY(nvmlEventSetCreate(&dev.event_set)); if(0 == NVML_TRY(nvmlDeviceGetSupportedEventTypes(dev.handle, &event_types))) { NVML_TRY(nvmlDeviceRegisterEvents(dev.handle, event_types, dev.event_set)); } else { dev.event_set = NULL; } for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT; ++type) { if(NVML_TRY(nvmlDeviceGetMaxClockInfo(dev.handle, type, &dev.max_clock[type]))) break; } get_device_features(&dev); mon->devices[i] = dev; } mon->last_update = time(NULL); }
// NVIDIA NVML library function wrapper for GPU DVFS. int SetGPUFreq(unsigned int clock_mem, unsigned int clock_core) { nvmlDevice_t device;//int device; nvmlReturn_t result; result = nvmlInit(); result = nvmlDeviceGetHandleByIndex(0, &device);//cudaGetDevice(&device); result = nvmlDeviceSetApplicationsClocks(device, clock_mem, clock_core);//(nvmlDevice_t)device if(result != NVML_SUCCESS) { printf("Failed to set GPU core and memory frequencies: %s\n", nvmlErrorString(result)); return 1; } else { nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &clock_core); nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_MEM, &clock_mem); ////printf("GPU core frequency is now set to %d MHz; GPU memory frequency is now set to %d MHz", clock_core, clock_mem); return 0; } }
/*
 * Refresh gpu_temp[] with the current NVML_TEMPERATURE_GPU reading of every
 * device.  Compiles to an empty function unless ENABLE_NVML is 1.
 * Errors are handled by NVML_CHECK.
 */
void update_temperature(void)
{
#if (ENABLE_NVML==1)
    unsigned int numDevices;
    NVML_CHECK(nvmlDeviceGetCount( &numDevices ));

    unsigned int gpu = 0;
    while( gpu < numDevices )
    {
        nvmlDevice_t handle;
        unsigned int celsius;

        NVML_CHECK(nvmlDeviceGetHandleByIndex( gpu, &handle ));
        NVML_CHECK(nvmlDeviceGetTemperature( handle, NVML_TEMPERATURE_GPU, &celsius ));

        gpu_temp[gpu] = celsius;
        DEBUG_PRINTF("temperature updated: (gpu %d) %d \n", gpu, celsius);

        ++gpu;
    }
#endif
}
// set the CPU affinity for this GPU void setCpuAffinity(unsigned int rank) { std::lock_guard<std::mutex> lock(NVMLInit::m_); static thread_local NVMLInit nvml_init_; bool result = false; unsigned int deviceCount = 0U; const std::vector<int>& gpus = Caffe::gpus(); if (nvmlDeviceGetCount(&deviceCount) == NVML_SUCCESS) { CHECK_LT(rank, deviceCount); if (rank < deviceCount && rank < gpus.size() && nvmlDeviceGetHandleByIndex(gpus[rank], &nvml_init_.device_) == NVML_SUCCESS) { if (nvmlDeviceSetCpuAffinity(nvml_init_.device_) == NVML_SUCCESS) { LOG(INFO) << "NVML succeeded to set CPU affinity on device " << gpus[rank]; result = true; } } } if (!result && rank < gpus.size()) { LOG(ERROR) << "NVML failed to set CPU affinity on device " << gpus[rank]; } }
void PCI_Device::initializeGpu( int idx, hwloc_topology_t topology) { int rc; nvmlDevice_t gpu_device; id = idx; rc = nvmlDeviceGetHandleByIndex(idx, &gpu_device); if (rc != NVML_SUCCESS) { string buf; buf = "nvmlDeviceGetHandleByIndex failed for nvidia gpus"; buf = buf + name.c_str(); log_err(-1, __func__, buf.c_str()); } else { nearest_cpuset = hwloc_bitmap_alloc(); if (nearest_cpuset != NULL) { rc = hwloc_nvml_get_device_cpuset(topology, gpu_device, nearest_cpuset); if (rc != 0) { string buf; buf = "could not get cpuset of "; buf = buf + name.c_str(); log_err(-1, __func__, buf.c_str()); } hwloc_bitmap_list_snprintf(cpuset_string, MAX_CPUSET_SIZE, nearest_cpuset); } } this->type = GPU; }
/*_________________---------------------------__________________
  _________________         nvml_tick         __________________
  -----------------___________________________------------------

  Called every second.  For each GPU, adds the current utilisation and
  power readings to the running totals in sp->nvml.  GPUs that cannot be
  queried are skipped.
*/
void nvml_tick(HSP *sp)
{
  unsigned int idx;

  if (!(sp->nvml.gpu_count > 0))
    return;

  for (idx = 0; idx < sp->nvml.gpu_count; ++idx) {
    nvmlDevice_t dev;
    nvmlUtilization_t usage;
    unsigned int milliwatts;

    if (nvmlDeviceGetHandleByIndex(idx, &dev) != NVML_SUCCESS)
      continue;

    if (nvmlDeviceGetUtilizationRates(dev, &usage) == NVML_SUCCESS) {
      /* accumulate as mS */
      sp->nvml.nvml_gpu_time += usage.gpu * 10;
      sp->nvml.nvml_mem_time += usage.memory * 10;
    }

    if (nvmlDeviceGetPowerUsage(dev, &milliwatts) == NVML_SUCCESS) {
      /* accumulate as mJ */
      sp->nvml.nvml_energy += milliwatts;
    }
  }
}
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    initnvml
 * Signature: ()Ljava/lang/String;
 *
 * Initialises NVML, looks up device 0 (the handle is stored in `device`,
 * which is declared outside this function) and queries its name.
 *
 * Returns a Java string "Device : <name>\n" on success.  On any failure a
 * diagnostic is printed to stdout and a null reference (0) is returned, as
 * before.  NVML stays initialised on success; it is shut down when a query
 * fails after a successful nvmlInit().
 */
JNIEXPORT jstring JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_initnvml
  (JNIEnv *env, jobject)
{
    /* Only device 0 is queried; also used in the diagnostics below. */
    const unsigned int i = 0;
    nvmlReturn_t result;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    char sentence[200];
    std::string err = "";

    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        /* Nothing else can work without the library. */
        return 0;
    }

    result = nvmlDeviceGetHandleByIndex(i, &device);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
        result = nvmlShutdown();
        return 0;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
        result = nvmlShutdown();
        return 0;
    }

    printf("Device : %s\n", name);
    /* Bounded formatting: never write past the end of `sentence`. */
    snprintf(sentence, sizeof(sentence), "Device : %s\n", name);
    err.append(sentence);

    return env->NewStringUTF(err.c_str());
}
/*
 * Copy the serial number of GPU `devIdx` into `serial`.
 *
 * `serial` is treated as a buffer of NVML_DEVICE_SERIAL_BUFFER_SIZE bytes.
 * If an NVML_CHECK raises std::runtime_error, a fixed placeholder text is
 * stored instead.  When ENABLE_NVML is not 1 the function does nothing and
 * `serial` is left untouched.
 */
void get_serial_number(unsigned int devIdx, char* serial)
{
#if (ENABLE_NVML==1)
    const unsigned int bufferSize = NVML_DEVICE_SERIAL_BUFFER_SIZE;

    try
    {
        nvmlDevice_t handle;
        NVML_CHECK(nvmlDeviceGetHandleByIndex( devIdx, &handle ));
        NVML_CHECK(nvmlDeviceGetSerial( handle, serial, bufferSize ));
    }
    catch(const std::runtime_error&)
    {
        static const char placeholder[] = "unknown (NVML runtime error)";
        std::strncpy( serial, placeholder, NVML_DEVICE_SERIAL_BUFFER_SIZE );
        /* strncpy does not terminate when the source fills the buffer */
        serial[NVML_DEVICE_SERIAL_BUFFER_SIZE-1] = '\0';
    }
#else
    (void)(devIdx);
    (void)(serial);
#endif
}
static int detectDevices( ) { nvmlReturn_t ret; nvmlEnableState_t mode = NVML_FEATURE_DISABLED; nvmlDevice_t handle; nvmlPciInfo_t info; cudaError_t cuerr; char busId[16]; char name[64]; char inforomECC[16]; char inforomPower[16]; char names[device_count][64]; char nvml_busIds[device_count][16]; float ecc_version = 0.0, power_version = 0.0; int i = 0, j = 0; int isTesla = 0; int isFermi = 0; int isUnique = 1; unsigned int temp = 0; /* list of nvml pci_busids */ for (i=0; i < device_count; i++) { ret = nvmlDeviceGetHandleByIndex( i, &handle ); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i); return PAPI_ESYS; } ret = nvmlDeviceGetPciInfo( handle, &info ); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", nvmlErrorString(ret) ); return PAPI_ESYS; } strncpy(nvml_busIds[i], info.busId, 16); } /* We want to key our list of nvmlDevice_ts by each device's cuda index */ for (i=0; i < device_count; i++) { cuerr = cudaDeviceGetPCIBusId( busId, 16, i ); if ( CUDA_SUCCESS != cuerr ) { SUBDBG("cudaDeviceGetPCIBusId failed.\n"); return PAPI_ESYS; } for (j=0; j < device_count; j++ ) { if ( !strncmp( busId, nvml_busIds[j], 16) ) { ret = nvmlDeviceGetHandleByIndex(j, &devices[i] ); if ( NVML_SUCCESS != ret ) SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i); return PAPI_ESYS; break; } } } memset(names, 0x0, device_count*64); /* So for each card, check whats querable */ for (i=0; i < device_count; i++ ) { isTesla=0; isFermi=1; isUnique = 1; features[i] = 0; ret = nvmlDeviceGetName( devices[i], name, 64 ); if ( NVML_SUCCESS != ret) { SUBDBG("nvmlDeviceGetName failed \n"); return PAPI_ESYS; } for (j=0; j < i; j++ ) if ( 0 == strncmp( name, names[j], 64 ) ) { /* if we have a match, and IF everything is sane, * devices with the same name eg Tesla C2075 share features */ isUnique = 0; features[i] = features[j]; } if ( isUnique ) { ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_ECC, inforomECC, 
16); if ( NVML_SUCCESS != ret ) { SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) ); isFermi = 0; } ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_POWER, inforomPower, 16); if ( NVML_SUCCESS != ret ) { /* This implies the card is older then Fermi */ SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) ); SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n"); isFermi = 0; } ecc_version = strtof(inforomECC, NULL ); power_version = strtof( inforomPower, NULL); ret = nvmlDeviceGetName( devices[i], name, 64 ); isTesla = ( NULL == strstr(name, "Tesla") ) ? 0:1; /* For Tesla and Quadro products from Fermi and Kepler families. */ if ( isFermi ) { features[i] |= FEATURE_CLOCK_INFO; num_events += 3; } /* For Tesla and Quadro products from Fermi and Kepler families. requires NVML_INFOROM_ECC 2.0 or higher for location-based counts requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts requires ECC mode to be enabled. */ if ( isFermi ) { ret = nvmlDeviceGetEccMode( devices[i], &mode, NULL ); if ( NVML_FEATURE_ENABLED == mode) { if ( ecc_version >= 2.0 ) { features[i] |= FEATURE_ECC_LOCAL_ERRORS; num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */ } if ( ecc_version >= 1.0 ) { features[i] |= FEATURE_ECC_TOTAL_ERRORS; num_events += 2; /* single bit errors, double bit errors */ } } } /* For all discrete products with dedicated fans */ features[i] |= FEATURE_FAN_SPEED; num_events++; /* For Tesla and Quadro products from Fermi and Kepler families. */ if ( isFermi ) { features[i] |= FEATURE_MAX_CLOCK; num_events += 3; } /* For all products */ features[i] |= FEATURE_MEMORY_INFO; num_events += 3; /* total, free, used */ /* For Tesla and Quadro products from the Fermi and Kepler families. 
*/ if ( isFermi ) { features[i] |= FEATURE_PERF_STATES; num_events++; } /* For "GF11x" Tesla and Quadro products from the Fermi family requires NVML_INFOROM_POWER 3.0 or higher For Tesla and Quadro products from the Kepler family does not require NVML_INFOROM_POWER */ if ( isFermi ) { ret = nvmlDeviceGetPowerUsage( devices[i], &temp); if ( NVML_SUCCESS == ret ) { features[i] |= FEATURE_POWER; num_events++; } } /* For all discrete and S-class products. */ features[i] |= FEATURE_TEMP; num_events++; /* For Tesla and Quadro products from the Fermi and Kepler families */ if (isFermi) { features[i] |= FEATURE_UTILIZATION; num_events += 2; } strncpy( names[i], name, 64); } } return PAPI_OK; }
static int hwloc_nvml_discover(struct hwloc_backend *backend) { struct hwloc_topology *topology = backend->topology; nvmlReturn_t ret; unsigned nb, i; if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))) return 0; if (!hwloc_topology_is_thissystem(topology)) { hwloc_debug("%s", "\nno NVML detection (not thissystem)\n"); return 0; } ret = nvmlInit(); if (NVML_SUCCESS != ret) return 0; ret = nvmlDeviceGetCount(&nb); if (NVML_SUCCESS != ret || !nb) { nvmlShutdown(); return 0; } for(i=0; i<nb; i++) { nvmlPciInfo_t pci; nvmlDevice_t device; hwloc_obj_t osdev, parent; char buffer[64]; ret = nvmlDeviceGetHandleByIndex(i, &device); assert(ret == NVML_SUCCESS); osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1); snprintf(buffer, sizeof(buffer), "nvml%d", i); osdev->name = strdup(buffer); osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN; osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_GPU; hwloc_obj_add_info(osdev, "Backend", "NVML"); hwloc_obj_add_info(osdev, "GPUVendor", "NVIDIA Corporation"); buffer[0] = '\0'; ret = nvmlDeviceGetName(device, buffer, sizeof(buffer)); hwloc_obj_add_info(osdev, "GPUModel", buffer); /* these may fail with NVML_ERROR_NOT_SUPPORTED on old devices */ buffer[0] = '\0'; ret = nvmlDeviceGetSerial(device, buffer, sizeof(buffer)); if (buffer[0] != '\0') hwloc_obj_add_info(osdev, "NVIDIASerial", buffer); buffer[0] = '\0'; ret = nvmlDeviceGetUUID(device, buffer, sizeof(buffer)); if (buffer[0] != '\0') hwloc_obj_add_info(osdev, "NVIDIAUUID", buffer); parent = NULL; if (NVML_SUCCESS == nvmlDeviceGetPciInfo(device, &pci)) { parent = hwloc_pci_belowroot_find_by_busid(topology, pci.domain, pci.bus, pci.device, 0); if (!parent) parent = hwloc_pci_find_busid_parent(topology, pci.domain, pci.bus, pci.device, 0); #if HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) { unsigned maxwidth = 0, maxgen = 0; float lanespeed; nvmlDeviceGetMaxPcieLinkWidth(device, 
&maxwidth); nvmlDeviceGetMaxPcieLinkGeneration(device, &maxgen); /* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding = 0.25GB/s data-rate per lane * PCIe Gen2 = 5 GT/s signal-rate per lane with 8/10 encoding = 0.5 GB/s data-rate per lane * PCIe Gen3 = 8 GT/s signal-rate per lane with 128/130 encoding = 1 GB/s data-rate per lane */ lanespeed = maxgen <= 2 ? 2.5 * maxgen * 0.8 : 8.0 * 128/130; /* Gbit/s per lane */ if (lanespeed * maxwidth) /* we found the max link speed, replace the current link speed found by pci (or none) */ parent->attr->pcidev.linkspeed = lanespeed * maxwidth / 8; /* GB/s */ } #endif } if (!parent) parent = hwloc_get_root_obj(topology); hwloc_insert_object_by_parent(topology, parent, osdev); } nvmlShutdown(); return nb; }
static int get_process_info(unsigned int*ncores,unsigned int *valarray) { nvmlReturn_t ret; ret=nvmlInit(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int c; ret=nvmlDeviceGetCount(&c); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret)); return -1; } *ncores=c; /* if(c!=NDEV){ fprintf(stderr,"ERROR:: Current number of Cores is [%d],not %d....YOU NEED RECOMPILE THIS ROUTINE\n",c,NDEV); return -2; } */ nvmlDevice_t devs[NDEV]; nvmlProcessInfo_t pis[MAXPROC]; int i; for(i=0; i<c; i++) { ret=nvmlDeviceGetHandleByIndex(i,&devs[i]); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret)); return -1; } unsigned int np=MAXPROC; ret=nvmlDeviceGetComputeRunningProcesses(devs[i],&np,pis); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret)); return -1; } valarray[i]=np; } ret=nvmlShutdown(); if(ret!=NVML_SUCCESS) { fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret)); return -1; } return 0; }
void CMeasureNVML<TSkipMs, TVariant>::init(void) { if(TVariant == VARIANT_FULL) { mrLog() << ">>> 'nvml' (full version)" << std::endl; } else { mrLog() << ">>> 'nvml' (light version)" << std::endl; } nvmlReturn_t result; int32_t rv; char const* args_set_pm[] = {"gpu_management", "-p 1", NULL}; uint32_t device_count; char name[NVML_DEVICE_NAME_BUFFER_SIZE]; nvmlPciInfo_t pci; nvmlEnableState_t mode; std::string modes[2] = {"disabled", "enabled"}; std::stringstream clk_gpu_str; std::stringstream clk_mem_str; nvmlPstates_t power_state; nvmlMemory_t memory; const uint32_t count = 32; uint32_t clk_mem_cnt = count; uint32_t clk_mem[count]; uint32_t clk_mem_max = 0; uint32_t clk_mem_min = 0xffffffff; uint32_t clk_mem_set = 0; uint32_t clk_gpu_min_arr_cnt = count; uint32_t clk_gpu_min_arr[clk_gpu_min_arr_cnt]; uint32_t clk_gpu_min = 0xffffffff; uint32_t clk_gpu_max_arr_cnt = count; uint32_t clk_gpu_max_arr[clk_gpu_max_arr_cnt]; uint32_t clk_gpu_max = 0; uint32_t clk_gpu_set = 0; uint32_t memory_total = 0; result = nvmlInit(); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot initialize nvml library. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetCount(&device_count); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot query device count. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } if (device_count > 1) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: this software has be rewritten if you want to support more than 1 device. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): get gpu device handler..."; mrLog.flush(); result = nvmlDeviceGetHandleByIndex(0, &mDevice); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device handler. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << " done!" << std::endl; result = nvmlDeviceGetName(mDevice, name, NVML_DEVICE_NAME_BUFFER_SIZE); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device name. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPciInfo(mDevice, &pci); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get pci information. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPowerManagementMode(mDevice, &mode); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no power managment supported. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetPerformanceState(mDevice, &power_state); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } result = nvmlDeviceGetSupportedMemoryClocks(mDevice, &clk_mem_cnt, clk_mem); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain memory clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int i=0; i<(int32_t)clk_mem_cnt; ++i) { clk_mem_min = (clk_mem[i]<clk_mem_min) ? clk_mem[i] : clk_mem_min; clk_mem_max = (clk_mem[i]>clk_mem_max) ? clk_mem[i] : clk_mem_max; } result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_min, &clk_gpu_min_arr_cnt, clk_gpu_min_arr); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) { clk_gpu_min = (clk_gpu_min_arr[i]<clk_gpu_min) ? clk_gpu_min_arr[i] : clk_gpu_min; } result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_max, &clk_gpu_max_arr_cnt, clk_gpu_max_arr); if (NVML_SUCCESS != result) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) { clk_gpu_max = (clk_gpu_max_arr[i]>clk_gpu_max) ? clk_gpu_max_arr[i] : clk_gpu_max; } result = nvmlDeviceGetMemoryInfo(mDevice, &memory); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } memory_total = (uint32_t)(memory.total >> 20); rv = exec_gpu_mgmt((char**)args_set_pm); if (rv) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): persistence mode enabled." 
<< std::endl; mrLog() << ">>> 'nvml' (thread main):" << std::endl << " device : " << name << std::endl << " pcie : " << pci.busId << std::endl << " power mgmt mode: " << modes[mode] << std::endl << " power state cur: " << power_state << std::endl << " power state min: " << NVML_PSTATE_15 << std::endl << " power state max: " << NVML_PSTATE_0 << std::endl << " memory total : " << memory_total << " MiB" << std::endl << " avail mem clks : "; for (int i=0; i<(int32_t)clk_mem_cnt; ++i) { if (i<(int32_t)clk_mem_cnt-1) { mrLog() << clk_mem[i] << " MHz, "; } else { mrLog() << clk_mem[i] << " MHz" << std::endl; } } mrLog() << " memory clk min : " << clk_mem_min << " MHz" << std::endl << " avail core clks: "; for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) { if (i<(int32_t)clk_gpu_min_arr_cnt-1) { mrLog() << clk_gpu_min_arr[i] << " MHz, "; } else { mrLog() << clk_gpu_min_arr[i] << " MHz" << std::endl; } } mrLog() << " core clk min : " << clk_gpu_min << " MHz" << std::endl; mrLog() << " memory clk max : " << clk_mem_max << " MHz" << std::endl << " avail core clks: "; for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) { if (i<(int32_t)clk_gpu_max_arr_cnt-1) { mrLog() << clk_gpu_max_arr[i] << " MHz, "; } else { mrLog() << clk_gpu_max_arr[i] << " MHz" << std::endl; } } mrLog() << " core clk max : " << clk_gpu_max << " MHz" << std::endl; switch (mGpuFrequency) { case GPU_FREQUENCY_MIN: clk_mem_set = clk_mem_min; clk_gpu_set = clk_gpu_min; break; case GPU_FREQUENCY_MAX: clk_mem_set = clk_mem_max; clk_gpu_set = clk_gpu_max; break; case GPU_FREQUENCY_CUR: default: clk_mem_set = 0; clk_gpu_set = 0; result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &clk_mem_set); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. 
(file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_GRAPHICS, &clk_gpu_set); if (NVML_SUCCESS != result) { mrLog.lock(); mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; mrLog.unlock(); exit(EXIT_FAILURE); } break; } if (mGpuFrequency == GPU_FREQUENCY_MIN || mGpuFrequency == GPU_FREQUENCY_MAX) { // In these cases we actually set the GPU frequencies either to the maximum or minimum value. clk_gpu_str << "-c " << clk_gpu_set; clk_mem_str << "-m " << clk_mem_set; char const* args_set_clk[] = {"gpu_management", clk_gpu_str.str().c_str() , clk_mem_str.str().c_str(), NULL}; rv = exec_gpu_mgmt((char**)args_set_clk); if (rv) { mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl; exit(EXIT_FAILURE); } mrLog() << ">>> 'nvml' (thread main): set core clk to " << clk_gpu_set << " MHz and mem clk to " << clk_mem_set << " MHz." << std::endl; } else { // We name the values *_set, but we don't set the frequency. We just print the current GPU frequency. mrLog() << ">>> 'nvml' (thread main): current core clk is " << clk_gpu_set << " MHz and mem clk is " << clk_mem_set << " MHz." << std::endl; } mrLog() << ">>> 'nvml' (thread main): wait for 15s to throttle gpu clocks." << std::endl; sleep(15); mrLog() << ">>> 'nvml' (thread main): initialization done." << std::endl << std::endl; }
/*
 * Discover the NVIDIA GPUs known to NVML and store each one on the
 * appropriate chip of this machine.
 *
 * For every NVML device index the matching hwloc object is looked up
 * (falling back to get_non_nvml_device()).  Devices that have a machine
 * ancestor in the topology are wrapped in a PCI_Device and handed to
 * store_device_on_appropriate_chip().  Devices whose NVML handle cannot be
 * obtained, or for which no hwloc object is found, are skipped.
 *
 * NVML errors are logged through log_nvml_error().
 *
 * @return PBSE_NONE in all cases.
 */
int Machine::initializeNVIDIADevices(hwloc_obj_t machine_obj, hwloc_topology_t topology)
  {
  nvmlReturn_t rc;

  /* Initialize the NVML handle.
   *
   * nvmlInit should be called once before invoking any other methods in the NVML library.
   * A reference count of the number of initializations is maintained. Shutdown only occurs
   * when the reference count reaches zero.
   * */
  rc = nvmlInit();
  if (rc != NVML_SUCCESS && rc != NVML_ERROR_ALREADY_INITIALIZED)
    {
    log_nvml_error(rc, NULL, __func__);
    return(PBSE_NONE);
    }

  unsigned int device_count = 0;

  /* Get the device count. */
  rc = nvmlDeviceGetCount(&device_count);
  if (rc == NVML_SUCCESS)
    {
    nvmlDevice_t gpu;

    /* Get the nvml device handle at each index */
    for (unsigned int idx = 0; idx < device_count; idx++)
      {
      rc = nvmlDeviceGetHandleByIndex(idx, &gpu);

      if (rc != NVML_SUCCESS)
        {
        /* TODO: get gpuid from nvmlDevice_t struct */
        log_nvml_error(rc, NULL, __func__);

        /* Without a valid handle `gpu` is uninitialised or still refers to
         * the previous device, so nothing below may use it. */
        continue;
        }

      /* Use the hwloc library to determine device locality */
      hwloc_obj_t gpu_obj;
      hwloc_obj_t ancestor_obj;

      gpu_obj = hwloc_nvml_get_device_osdev(topology, gpu);
      if (gpu_obj == NULL)
        {
        /* This was not an nvml device. We will look for a "card" device (GeForce or Quadro) */
        gpu_obj = this->get_non_nvml_device(topology, gpu);
        if (gpu_obj == NULL)
          continue;
        }

      /* The ancestor was not a numa chip. Is it the machine? */
      ancestor_obj = hwloc_get_ancestor_obj_by_type(topology, HWLOC_OBJ_MACHINE, gpu_obj);
      if (ancestor_obj != NULL)
        {
        PCI_Device new_device;

        new_device.initializePCIDevice(gpu_obj, idx, topology);

        store_device_on_appropriate_chip(new_device);
        }
      }
    }
  else
    {
    log_nvml_error(rc, NULL, __func__);
    }

  /* Shutdown the NVML handle.
   *
   * nvmlShutdown should be called after NVML work is done, once for each call to nvmlInit()
   * A reference count of the number of initializations is maintained. Shutdown only occurs when
   * the reference count reaches zero. For backwards compatibility, no error is reported if
   * nvmlShutdown() is called more times than nvmlInit().
   * */
  rc = nvmlShutdown();
  if (rc != NVML_SUCCESS)
    {
    log_nvml_error(rc, NULL, __func__);
    }

  return(PBSE_NONE);
  }
int main() { nvmlReturn_t result; unsigned int device_count, i; // First initialize NVML library result = nvmlInit(); if (NVML_SUCCESS != result) { printf("Failed to initialize NVML: %s\n", nvmlErrorString(result)); printf("Press ENTER to continue...\n"); getchar(); return 1; } result = nvmlDeviceGetCount(&device_count); if (NVML_SUCCESS != result) { printf("Failed to query device count: %s\n", nvmlErrorString(result)); goto Error; } printf("Found %d device%s\n\n", device_count, device_count != 1 ? "s" : ""); printf("Listing devices:\n"); for (i = 0; i < device_count; i++) { nvmlDevice_t device; char name[NVML_DEVICE_NAME_BUFFER_SIZE]; nvmlPciInfo_t pci; nvmlComputeMode_t compute_mode; // Query for device handle to perform operations on a device // You can also query device handle by other features like: // nvmlDeviceGetHandleBySerial // nvmlDeviceGetHandleByPciBusId result = nvmlDeviceGetHandleByIndex(i, &device); if (NVML_SUCCESS != result) { printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); if (NVML_SUCCESS != result) { printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result)); goto Error; } // pci.busId is very useful to know which device physically you're talking to // Using PCI identifier you can also match nvmlDevice handle to CUDA device. result = nvmlDeviceGetPciInfo(device, &pci); if (NVML_SUCCESS != result) { printf("Failed to get pci info for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } printf("%d. 
%s [%s]\n", i, name, pci.busId); // This is a simple example on how you can modify GPU's state result = nvmlDeviceGetComputeMode(device, &compute_mode); if (NVML_ERROR_NOT_SUPPORTED == result) printf("\t This is not CUDA capable device\n"); else if (NVML_SUCCESS != result) { printf("Failed to get compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } else { // try to change compute mode printf("\t Changing device's compute mode from '%s' to '%s'\n", convertToComputeModeString(compute_mode), convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED)); result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED); if (NVML_ERROR_NO_PERMISSION == result) printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result)); else if (NVML_ERROR_NOT_SUPPORTED == result) printf("\t\t Compute mode prohibited not supported. You might be running on\n" "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n"); else if (NVML_SUCCESS != result) { printf("\t\t Failed to set compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } else { printf("\t Restoring device's compute mode back to '%s'\n", convertToComputeModeString(compute_mode)); result = nvmlDeviceSetComputeMode(device, compute_mode); if (NVML_SUCCESS != result) { printf("\t\t Failed to restore compute mode for device %i: %s\n", i, nvmlErrorString(result)); goto Error; } } } } result = nvmlShutdown(); if (NVML_SUCCESS != result) printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); printf("All done.\n"); printf("Press ENTER to continue...\n"); getchar(); return 0; Error: result = nvmlShutdown(); if (NVML_SUCCESS != result) printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); printf("Press ENTER to continue...\n"); getchar(); return 1; }
int main(int argc,char* argv[]){ /**Initialize signal**/ signal(SIGINT ,_end_server); signal(SIGUSR1,_end_server); /**Initialize struct proc**/ init_proc(); init_cons(); /**Process becomes dem**/ pid_t process_id = 0; pid_t sid = 0; if(argc >= 2){ process_id = fork(); if(process_id < 0){ printf("fork failed ..\n"); exit(1); } if(process_id > 0){ exit(0); } umask(0); sid = setsid(); if(sid < 0){ exit(1); } close(STDIN_FILENO); close(STDOUT_FILENO); close(STDERR_FILENO); }else{ sid = getpid(); } /**Setup the log file**/ char log[32]; sprintf(log,"log.%u",sid); // fp = fopen(log,"w+"); /**Start Initialize nvidia management library from Here!!**/ nvmlReturn_t nres; int i; nres = nvmlInit(); if(nres != NVML_SUCCESS){ perror("Failed to initialize Nvidia Managerment Library...\n"); exit(-1); } nres = nvmlDeviceGetCount(&dem.ndev); if(nres != NVML_SUCCESS){ perror("Failed to get num of device...\n"); exit(-1); } dem.devs = (nvmlDevice_t*)malloc(sizeof(nvmlDevice_t)*dem.ndev); dem.flags = (dflag*)malloc(sizeof(dflag)*dem.ndev); MAXPROC = dem.ndev * 4; for(i = 0 ; i < dem.ndev ; i ++){ nres = nvmlDeviceGetHandleByIndex(i,&dem.devs[i]); if(nres != NVML_SUCCESS){ perror("Failed to get device handle\n"); exit(-1); } dem.flags[i].sd = -1; dem.flags[i].flag = 0; dem.flags[i].stayed = 0; dem.flags[i].reserved = 0; } dem.procCounter = 0; /**Setup the socket**/ int len,rc,on = 1; int listen_sd,max_sd,new_sd; int desc_ready; int close_conn; struct sockaddr_un addr; struct timeval timeout; fd_set master_set,working_set; listen_sd = socket(AF_UNIX,SOCK_STREAM,0); if(listen_sd < 0){ perror("socket() failed\n"); exit(-1); } rc = setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, (char*)&on,sizeof(on)); if(rc < 0){ perror("setsockopt() failed\n"); exit(-1); } unlink("mocu_server"); memset(&addr,0,sizeof(addr)); addr.sun_family = AF_UNIX; strcpy(addr.sun_path,"mocu_server"); rc = bind(listen_sd,(struct sockaddr*)&addr,sizeof(addr)); if(rc < 0){ perror("bind() failed"); close(listen_sd); 
exit(-1); } rc = listen(listen_sd,SOMAXCONN); if(rc < 0){ perror("listen() failed"); close(listen_sd); exit(-1); } FD_ZERO(&master_set); max_sd = listen_sd; FD_SET(listen_sd,&master_set); timeout.tv_sec = 3*60; timeout.tv_usec = 0; long counter = 0; /**Entering main loop**/ proc_data* receivedProc = (proc_data*)malloc(sizeof(proc_data)); mocu_check(); do{ memcpy(&working_set,&master_set,sizeof(master_set)); rc = select(max_sd+1, &working_set, NULL, NULL, NULL); if(rc < 0){ perror("select() failed\n"); break; } if(rc == 0){ printf("select() time out. End program.\n"); break; } desc_ready = rc; for(i = 0 ; i < max_sd+1 && desc_ready > 0 ; ++i){ if(FD_ISSET(i,&working_set)){ desc_ready = -1; if(i == listen_sd){ new_sd = accept(listen_sd,NULL,NULL); if(new_sd < 0){ printf("accept() failed"); end_server = TRUE; } FD_SET(new_sd,&master_set); if(new_sd > max_sd){ max_sd = new_sd; } }else{ rc = recv(i,receivedProc,sizeof(proc_data),0); if(rc <= 0){ FD_CLR(i,&master_set); _FIN(i); }else{ if(receivedProc->REQUEST == CONNECT){ _CONNECT(i,receivedProc); }else if(receivedProc->REQUEST == RENEW){ _RENEW(i,receivedProc); }else if(receivedProc->REQUEST == MIGDONE){ _MIGDONE(i,receivedProc); }else if(receivedProc->REQUEST == CANRECEIVE){ _CANRECEIVE(i,receivedProc); }else if(receivedProc->REQUEST == FAILEDTOALLOC){ _FAILEDTOALLOC(i,receivedProc); exit(-1);//TEST }else if(receivedProc->REQUEST == MALLOCDONE){ _MALLOCDONE(i,receivedProc); }else if(receivedProc->REQUEST == CUDAMALLOC){ _CUDAMALLOC(i,receivedProc); }else if(receivedProc->REQUEST == BACKUPED){ _BACKUPED(i,receivedProc); }else if(receivedProc->REQUEST == CONTEXT_CHECK){ _CONTEXT_CHECK(i,receivedProc); }else if(receivedProc->REQUEST == CREATE_CONTEXT){ _CREATE_CONTEXT(i); }else if(receivedProc->REQUEST == CONSOLE){ _CONSOLE(i); }else{ printf("Unkown request...\n"); exit(-1); } } } } } mocu_check(); }while(end_server == FALSE); int closed = 0; for(i = 0 ; i < max_sd ; i ++){ if(FD_ISSET(i,&master_set)){ close(i); closed = 
1; } } // fclose(fp); return 0; }
/**
 * Fill `nvml` with the GPU counters and gauges for the host.
 *
 * The accumulators (gpu_time, mem_time, energy) and the device count are
 * copied from `sp->nvml`. The remaining fields are gathered by querying each
 * GPU index in [0, sp->nvml.gpu_count):
 *   - mem_total, mem_free, ecc_errors and processes are summed over all GPUs;
 *   - temperature and fan_speed hold the maximum seen on any GPU.
 * A per-GPU query that does not succeed is skipped and contributes nothing.
 *
 * @param sp    agent state; only sp->nvml is read.
 * @param nvml  output structure; every field listed above is overwritten.
 * @return NO when there are no GPUs or a device handle cannot be obtained
 *         (in the latter case `nvml` holds the partial totals gathered so
 *         far), YES otherwise.
 */
int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml)
{
  unsigned int i;

  if(sp->nvml.gpu_count == 0) {
    return NO;
  }

  // pick up latest value of accumulators
  nvml->gpu_time = sp->nvml.nvml_gpu_time;
  nvml->mem_time = sp->nvml.nvml_mem_time;
  nvml->energy = sp->nvml.nvml_energy;

  // and fill in the rest of the counters/gauges too
  nvml->device_count = sp->nvml.gpu_count;

  // zero these, and sum across all GPUs
  nvml->mem_total = 0;
  nvml->mem_free = 0;
  nvml->ecc_errors = 0;
  nvml->processes = 0;

  // use the max across all GPUs
  nvml->temperature = 0;
  nvml->fan_speed = 0;

  for (i = 0; i < sp->nvml.gpu_count; ++i) {
    unsigned long long eccErrors;
    unsigned int temp;
    nvmlDevice_t gpu;
    unsigned int speed;
    // infoCount is an in/out argument: with a NULL info array it must be 0
    // on entry to ask NVML for the number of running processes, so it has to
    // be reset for every GPU rather than left uninitialized.
    unsigned int procs = 0;
    nvmlMemory_t memInfo;
    nvmlReturn_t result;

    if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
      return NO;
    }

    if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
      nvml->mem_total += memInfo.total;
      nvml->mem_free  += memInfo.free;
    }

    // single-bit and double-bit volatile ECC errors go into the same total
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }
    if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
      nvml->ecc_errors += eccErrors;
    }

    if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
      if (nvml->temperature < temp) {
        nvml->temperature = temp;
      }
    }

    if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
      if (nvml->fan_speed < speed) {
        nvml->fan_speed = speed;
      }
    }

    // INSUFFICIENT_SIZE is the expected answer when processes are running
    // and no array was supplied; procs then holds the count.
    result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
    if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
      nvml->processes += procs;
    }
  }

  return YES;
}