Ejemplo n.º 1
0
/*
 * Take a one-shot snapshot of NVML statistics for every device NVML reports.
 *
 * stats: out-parameter; receives a calloc'd array with one devstat per
 *        device (NULL when the allocation fails). The caller owns it and
 *        releases it with free().
 *
 * Returns the device count, or -1 when the array cannot be allocated.
 * NVML is initialised on entry and shut down before returning.
 *
 * NOTE(review): what CHK_NVML does on failure (return, exit, log only) is
 * defined elsewhere -- confirm it does not leave NVML initialised.
 */
int probe_gpustats(devstat**stats)
{
    unsigned int n_dev;
    unsigned int i;
    nvmlReturn_t nvret;

    nvret=nvmlInit();
    CHK_NVML(nvret,"Init NVML");

    nvret=nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret,"getCount");

    devstat*pstats=(devstat*)calloc(n_dev,sizeof(devstat));
    *stats=pstats;
    /* calloc(0, ...) may legitimately return NULL, so only treat NULL as an
     * error when there is at least one device to describe. */
    if(pstats==NULL && n_dev>0){
        nvmlShutdown();
        return -1;
    }

    for(i=0;i<n_dev;i++){
        unsigned int sampp; /* sampling period reported by NVML; not stored */

        /* Without a handle the remaining queries cannot succeed; the entry
         * stays zero-filled from calloc. */
        if(nvmlDeviceGetHandleByIndex(i,&pstats[i].handler)!=NVML_SUCCESS)
            continue;

        /* Best effort, as before: a query that fails leaves its field zeroed. */
        nvmlDeviceGetMemoryInfo(pstats[i].handler,&pstats[i].meminfo);
        nvmlDeviceGetUtilizationRates(pstats[i].handler,&pstats[i].utils);
        nvmlDeviceGetEncoderUtilization(pstats[i].handler,&pstats[i].encutil,&sampp);
        nvmlDeviceGetDecoderUtilization(pstats[i].handler,&pstats[i].decutil,&sampp);
    }

    nvret=nvmlShutdown();
    CHK_NVML(nvret,"Shutdown NVML");

    return n_dev;
}
Ejemplo n.º 2
0
/*
 * Report the used memory of every NVML device.
 *
 * ncores:    out; receives the number of devices NVML reports.
 * usedarray: out; usedarray[i] receives nvmlMemory_t.used for device i.
 *            The caller must provide room for at least *ncores entries.
 *
 * Returns 0 on success, -1 on any NVML failure (a message is printed to
 * stderr). NVML is shut down on every path once nvmlInit() has succeeded.
 *
 * NOTE(review): nvmlMemory_t.used is wider than unsigned int in the NVML
 * headers, so large byte counts are truncated by the store into usedarray --
 * confirm callers tolerate this.
 */
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray)
{
    nvmlReturn_t ret;
    unsigned int c;
    unsigned int i;
    int rc=-1;

    ret=nvmlInit();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        goto out;
    }

    *ncores=c;

    for(i=0; i<c; i++) {
        /* The handle is only needed inside this iteration, so a single local
         * replaces the fixed devs[NDEV] array that overflowed when c > NDEV. */
        nvmlDevice_t dev;
        nvmlMemory_t meminfo;

        ret=nvmlDeviceGetHandleByIndex(i,&dev);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            goto out;
        }

        ret=nvmlDeviceGetMemoryInfo(dev,&meminfo);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret));
            goto out;
        }
        usedarray[i]=meminfo.used;
    }

    rc=0;

out:
    /* Balance the successful nvmlInit() on error paths as well. */
    ret=nvmlShutdown();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    return rc;
}
Ejemplo n.º 3
0
/*
 * Populate `mon` with host name, driver/NVML versions and one
 * `struct device` record per NVML device, then stamp mon->last_update.
 *
 * mon->devices is allocated here with calloc; if that allocation fails,
 * mon->dev_count is reset to 0 so the array is never indexed.
 *
 * NOTE(review): NVML_TRY is defined elsewhere; its use in conditions below
 * shows it yields 0 on success. Whether it logs or aborts on failure is not
 * visible here.
 */
static void init_device_info(struct monitor* mon)
{
  /* gethostname() is not required to NUL-terminate a truncated name, so
   * terminate the 64-byte window it was given explicitly. */
  gethostname(mon->hostname, 64);
  mon->hostname[63] = '\0';

  NVML_TRY(nvmlSystemGetDriverVersion(mon->driver_version,
                                      sizeof(mon->driver_version)));
  NVML_TRY(nvmlSystemGetNVMLVersion(mon->nvml_version,
                                    sizeof(mon->nvml_version)));

  NVML_TRY(nvmlDeviceGetCount(&mon->dev_count));

  mon->devices = calloc(mon->dev_count, sizeof(struct device));
  if(mon->devices == NULL) {
    /* No storage for device records: expose an empty device list instead of
     * writing through a NULL pointer in the loop below. */
    mon->dev_count = 0;
  }

  for(unsigned i = 0; i < mon->dev_count; ++i) {
    struct device dev;
    memset(&dev, 0, sizeof(struct device));

    dev.index = i;

    NVML_TRY(nvmlDeviceGetHandleByIndex(i, &dev.handle));

    NVML_TRY(nvmlDeviceGetName(dev.handle, dev.name, sizeof(dev.name)));
    NVML_TRY(nvmlDeviceGetSerial(dev.handle, dev.serial, sizeof(dev.serial)));
    NVML_TRY(nvmlDeviceGetUUID(dev.handle, dev.uuid, sizeof(dev.uuid)));

    NVML_TRY(nvmlDeviceGetPciInfo(dev.handle, &dev.pci));
    NVML_TRY(nvmlDeviceGetMemoryInfo(dev.handle, &dev.memory));

    unsigned long long event_types;
    NVML_TRY(nvmlEventSetCreate(&dev.event_set));
    if(0 == NVML_TRY(nvmlDeviceGetSupportedEventTypes(dev.handle, &event_types))) {
      NVML_TRY(nvmlDeviceRegisterEvents(dev.handle, event_types, dev.event_set));
    } else {
      /* Events are unavailable for this device: release the set that was
       * just created before dropping the only reference to it. */
      if(dev.event_set != NULL) {
        NVML_TRY(nvmlEventSetFree(dev.event_set));
      }
      dev.event_set = NULL;
    }

    /* Stop at the first clock type the device refuses to report. */
    for(nvmlClockType_t type = NVML_CLOCK_GRAPHICS; type < NVML_CLOCK_COUNT;
        ++type) {
      if(NVML_TRY(nvmlDeviceGetMaxClockInfo(dev.handle, type,
                                            &dev.max_clock[type])))
        break;
    }

    get_device_features(&dev);

    mon->devices[i] = dev;
  }

  mon->last_update = time(NULL);
}
Ejemplo n.º 4
0
// NVIDIA NVML library function wrapper for GPU DVFS.
//
// Sets the application clocks of NVML device 0.
//   clock_mem  - requested memory clock, passed to
//                nvmlDeviceSetApplicationsClocks as its memory-clock argument
//   clock_core - requested graphics clock, passed as its graphics-clock argument
// Returns 0 on success, 1 on failure (a message is printed to stdout).
//
// Every NVML step is checked so a failed init or handle lookup can no longer
// fall through to a call on an uninitialized device handle.
//
// NOTE(review): NVML is left initialized on return, exactly as before; confirm
// whether other code relies on that or whether nvmlShutdown() should be paired
// with the nvmlInit() here.
int SetGPUFreq(unsigned int clock_mem, unsigned int clock_core) {
    nvmlDevice_t device;
    nvmlReturn_t result;

    result = nvmlInit();
    if(result != NVML_SUCCESS)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceGetHandleByIndex(0, &device);
    if(result != NVML_SUCCESS)
    {
        printf("Failed to get handle for device 0: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceSetApplicationsClocks(device, clock_mem, clock_core);
    if(result != NVML_SUCCESS)
    {
        printf("Failed to set GPU core and memory frequencies: %s\n", nvmlErrorString(result));
        return 1;
    }

    // The former read-back of the applied clocks only wrote into the by-value
    // parameters and was never reported, so it has been dropped.
    return 0;
}
Ejemplo n.º 5
0
// Refreshes gpu_temp[] with the current NVML_TEMPERATURE_GPU reading of every
// device NVML reports. Compiles to an empty function unless ENABLE_NVML is 1.
// Failure handling is whatever NVML_CHECK implements (defined elsewhere).
void update_temperature(void)
{
#if (ENABLE_NVML==1)
    unsigned int deviceCount;
    unsigned int devIdx;

    NVML_CHECK(nvmlDeviceGetCount( &deviceCount ));

    for( devIdx = 0; devIdx < deviceCount; devIdx++ )
    {
        nvmlDevice_t devHandle;
        unsigned int devTemperature;

        // Resolve the handle first, then read the sensor through it.
        NVML_CHECK(nvmlDeviceGetHandleByIndex( devIdx, &devHandle ));
        NVML_CHECK(nvmlDeviceGetTemperature( devHandle, NVML_TEMPERATURE_GPU, &devTemperature ));

        gpu_temp[devIdx] = devTemperature;
        DEBUG_PRINTF("temperature updated: (gpu %d) %d \n", devIdx, devTemperature);
    }
#endif
}
Ejemplo n.º 6
0
// Bind the calling thread to the CPUs NVML considers closest to the GPU that
// Caffe::gpus() lists at position `rank`. Runs under NVMLInit::m_ and logs the
// outcome; an error is logged only when `rank` indexes into Caffe::gpus().
void setCpuAffinity(unsigned int rank) {
  std::lock_guard<std::mutex> lock(NVMLInit::m_);
  static thread_local NVMLInit nvml_init_;
  unsigned int deviceCount = 0U;
  const std::vector<int>& gpus = Caffe::gpus();

  bool affinity_set = false;
  if (nvmlDeviceGetCount(&deviceCount) == NVML_SUCCESS) {
    CHECK_LT(rank, deviceCount);
    // Short-circuit evaluation keeps the original order: bounds checks,
    // then handle lookup, then the affinity call.
    affinity_set =
        rank < deviceCount && rank < gpus.size() &&
        nvmlDeviceGetHandleByIndex(gpus[rank], &nvml_init_.device_) == NVML_SUCCESS &&
        nvmlDeviceSetCpuAffinity(nvml_init_.device_) == NVML_SUCCESS;
  }

  if (affinity_set) {
    LOG(INFO) << "NVML succeeded to set CPU affinity on device " << gpus[rank];
    return;
  }
  if (rank < gpus.size()) {
    LOG(ERROR) << "NVML failed to set CPU affinity on device " << gpus[rank];
  }
}
Ejemplo n.º 7
0
/*
 * Set this PCI_Device up as GPU number idx.
 *
 * Records idx as the device id, looks up the NVML handle for that index and,
 * when that succeeds, asks hwloc for the cpuset nearest to the GPU and
 * renders it into cpuset_string. The device type is set to GPU on every
 * path. Failures are reported through log_err and are not fatal.
 *
 * @param idx       NVML device index
 * @param topology  hwloc topology used for the cpuset lookup
 */
void PCI_Device::initializeGpu(

  int              idx,
  hwloc_topology_t topology)

  {
  int rc;
  nvmlDevice_t  gpu_device;
  
  id = idx;
  rc = nvmlDeviceGetHandleByIndex(idx, &gpu_device);
  if (rc != NVML_SUCCESS)
    {
    string buf;

    /* trailing space keeps the device name from running into the message */
    buf = "nvmlDeviceGetHandleByIndex failed for nvidia gpus ";
    buf = buf + name.c_str();
    log_err(-1, __func__, buf.c_str());
    }
  else
    {
    nearest_cpuset = hwloc_bitmap_alloc();
    if (nearest_cpuset != NULL)
      {
      rc = hwloc_nvml_get_device_cpuset(topology, gpu_device, nearest_cpuset);
      if (rc != 0)
        {
        string  buf;

        buf = "could not get cpuset of ";
        buf = buf + name.c_str();
        log_err(-1, __func__, buf.c_str());
        }

      /* rendered even after a failed lookup, as before, so cpuset_string
       * always reflects the current contents of nearest_cpuset */
      hwloc_bitmap_list_snprintf(cpuset_string, MAX_CPUSET_SIZE, nearest_cpuset);
      }
    else
      {
      string  buf;

      /* previously an allocation failure was silent */
      buf = "could not allocate cpuset bitmap for ";
      buf = buf + name.c_str();
      log_err(-1, __func__, buf.c_str());
      }
    }

  this->type = GPU;

  }
Ejemplo n.º 8
0
  /*_________________---------------------------__________________
    _________________     nvml_tick             __________________
    -----------------___________________________------------------
    Called every second.
    For each GPU, adds the current utilization and power readings to the
    running totals kept in sp->nvml. GPUs whose handle or readings cannot
    be obtained are skipped for this tick.
  */
  void nvml_tick(HSP *sp) {
    unsigned int idx;

    /* nothing to sample */
    if (!(sp->nvml.gpu_count > 0)) {
      return;
    }

    for (idx = 0; idx < sp->nvml.gpu_count; idx++) {
      nvmlDevice_t dev;
      nvmlUtilization_t rates;
      unsigned int milliwatts;

      if (nvmlDeviceGetHandleByIndex(idx, &dev) == NVML_SUCCESS) {
        if (nvmlDeviceGetUtilizationRates(dev, &rates) == NVML_SUCCESS) {
          /* accumulate as mS */
          sp->nvml.nvml_gpu_time += rates.gpu * 10;
          sp->nvml.nvml_mem_time += rates.memory * 10;
        }
        if (nvmlDeviceGetPowerUsage(dev, &milliwatts) == NVML_SUCCESS) {
          /* accumulate as mJ */
          sp->nvml.nvml_energy += milliwatts;
        }
      }
    }
  }
Ejemplo n.º 9
0
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    initnvml
 * Signature: ()Ljava/lang/String;
 *
 * Initializes NVML, stores the handle of device 0 in `device` (declared
 * outside this function) and returns "Device : <name>\n" as a Java string.
 * On any failure a message is printed to stdout and a null reference is
 * returned; NVML is shut down again if it had been initialized.
 * On success NVML is left initialized.
 *
 * NOTE(review): failures have always produced a null jstring rather than the
 * error text; that contract is kept here -- confirm the Java side checks for
 * null.
 */
JNIEXPORT jstring JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_initnvml
  (JNIEnv *env, jobject)
{
    /* Only device 0 is handled; the messages used to print an uninitialized
     * index variable instead. */
    const int device_index = 0;
    nvmlReturn_t result;
    char sentence[200];
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];

    result = nvmlInit();
    if (NVML_SUCCESS != result) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        /* Nothing else can work without NVML, and there is nothing to shut down. */
        return 0;
    }

    result = nvmlDeviceGetHandleByIndex(device_index, &device);
    if (NVML_SUCCESS != result) {
        printf("Failed to get handle for device %i: %s\n", device_index, nvmlErrorString(result));
        nvmlShutdown();
        return 0;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result) {
        printf("Failed to get name of device %i: %s\n", device_index, nvmlErrorString(result));
        nvmlShutdown();
        return 0;
    }

    printf("Device : %s\n",name);
    /* bounded formatting: the result can never overrun `sentence` */
    snprintf(sentence, sizeof(sentence), "Device : %s\n", name);
    return env->NewStringUTF( sentence );
}
Ejemplo n.º 10
0
// Writes the serial number of NVML device `devIdx` into `serial`, which is
// treated as a buffer of NVML_DEVICE_SERIAL_BUFFER_SIZE bytes. If an NVML
// call raises std::runtime_error, a fixed placeholder text is stored instead.
// Without ENABLE_NVML==1 the function does nothing and leaves `serial` as is.
void get_serial_number(unsigned int devIdx, char* serial)
{
#if (ENABLE_NVML==1)
    try
    {
        unsigned int serialLength = NVML_DEVICE_SERIAL_BUFFER_SIZE;
        nvmlDevice_t devHandle;

        NVML_CHECK(nvmlDeviceGetHandleByIndex( devIdx, &devHandle ));
        NVML_CHECK(nvmlDeviceGetSerial( devHandle, serial, serialLength ));
    }
    catch(const std::runtime_error&)
    {
        static const char placeholder[] = "unknown (NVML runtime error)";

        std::strncpy(serial, placeholder, NVML_DEVICE_SERIAL_BUFFER_SIZE);
        // strncpy does not terminate on truncation, so force it
        serial[NVML_DEVICE_SERIAL_BUFFER_SIZE-1] = '\0';
    }
#else
    (void)(devIdx);
    (void)(serial);
#endif
}
Ejemplo n.º 11
0
		/*
		 * Build the table of NVML device handles keyed by CUDA device index
		 * and work out which metrics each device can report.
		 *
		 * Step 1: collect the PCI bus id NVML reports for every device.
		 * Step 2: for every CUDA device, find the NVML device with the same
		 *         bus id and store its handle in devices[cuda index].
		 * Step 3: per device, set the FEATURE_* bits in features[] and grow
		 *         num_events. Devices whose name matches an earlier device
		 *         copy that device's feature bits and add no events.
		 *
		 * Uses the file-level device_count, devices[], features[] and
		 * num_events. Returns PAPI_OK, or PAPI_ESYS as soon as a required
		 * NVML/CUDA query fails.
		 */
		static int 
detectDevices( ) 
{
		nvmlReturn_t ret;
		nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
		nvmlDevice_t handle;
		nvmlPciInfo_t info;

		cudaError_t cuerr;

		char busId[16];
		char name[64];
		char inforomECC[16];
		char inforomPower[16];
		char names[device_count][64];
		char nvml_busIds[device_count][16];

		float ecc_version = 0.0, power_version = 0.0;

		int i = 0,
			j = 0;
		int isFermi	= 0;
		int isUnique = 1;

		unsigned int temp = 0;


		/* list of nvml pci_busids */
	for (i=0; i < device_count; i++) {
		ret = nvmlDeviceGetHandleByIndex( i, &handle );	
		if ( NVML_SUCCESS != ret ) {
			SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i);
			return PAPI_ESYS;
		}

		ret = nvmlDeviceGetPciInfo( handle, &info );
		if ( NVML_SUCCESS != ret ) {
			SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", nvmlErrorString(ret) );
			return PAPI_ESYS;
		}

		/* may be left unterminated; only ever compared with strncmp(.., 16) */
		strncpy(nvml_busIds[i], info.busId, 16);
		
	}

	/* We want to key our list of nvmlDevice_ts by each device's cuda index */
	for (i=0; i < device_count; i++) {
			cuerr = cudaDeviceGetPCIBusId( busId, 16, i );
			if ( CUDA_SUCCESS != cuerr ) {
				SUBDBG("cudaDeviceGetPCIBusId failed.\n");
				return PAPI_ESYS;
			}
			for (j=0; j < device_count; j++ ) {
					if ( !strncmp( busId, nvml_busIds[j], 16) ) {
							ret = nvmlDeviceGetHandleByIndex(j, &devices[i] );
							/* Braces matter here: without them the return
							 * ran unconditionally on the first match. */
							if ( NVML_SUCCESS != ret ) {
								SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i);
								return PAPI_ESYS;
							}
							break;
					}
			}	
	}

		memset(names, 0x0, device_count*64);
		/* So for each card, check whats querable */
		for (i=0; i < device_count; i++ ) {
				isFermi=1;
				isUnique = 1;
				features[i] = 0;

				ret = nvmlDeviceGetName( devices[i], name, 64 );
				if ( NVML_SUCCESS != ret) {
					SUBDBG("nvmlDeviceGetName failed \n");
					return PAPI_ESYS;
				}

				for (j=0; j < i; j++ ) 
						if ( 0 == strncmp( name, names[j], 64 ) ) {
								/* if we have a match, and IF everything is sane, 
								 * devices with the same name eg Tesla C2075 share features */
								isUnique = 0;
								features[i] = features[j];

						}

				if ( isUnique ) {
						/* Start from empty strings so strtof() below never
						 * parses indeterminate bytes when a query fails. */
						inforomECC[0] = '\0';
						inforomPower[0] = '\0';

						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_ECC, inforomECC, 16);
						if ( NVML_SUCCESS != ret ) {
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								isFermi = 0;
						}
						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_POWER, inforomPower, 16);
						if ( NVML_SUCCESS != ret ) {
								/* This implies the card is older then Fermi */
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n");
								isFermi = 0;
						} 

						ecc_version = strtof(inforomECC, NULL );
						power_version = strtof( inforomPower, NULL);
						(void) power_version; /* parsed but not consulted below */

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_CLOCK_INFO;
								num_events += 3;
						}

						/* 	For Tesla and Quadro products from Fermi and Kepler families. 
							requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
							requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
							requires ECC mode to be enabled. */
						if ( isFermi ) {
								ret = nvmlDeviceGetEccMode( devices[i], &mode, NULL );
								/* Only trust mode when the query succeeded; otherwise it
								 * would still hold the previous device's answer. */
								if ( NVML_SUCCESS == ret && NVML_FEATURE_ENABLED == mode) {
										if ( ecc_version >= 2.0 ) {
												features[i] |= FEATURE_ECC_LOCAL_ERRORS;
												num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
										} 
										if ( ecc_version >= 1.0 ) {
												features[i] |= FEATURE_ECC_TOTAL_ERRORS;
												num_events += 2; /* single bit errors, double bit errors */
										}
								}	
						}

						/* For all discrete products with dedicated fans */
						features[i] |= FEATURE_FAN_SPEED;
						num_events++;

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_MAX_CLOCK;
								num_events += 3;
						}

						/* For all products */
						features[i] |= FEATURE_MEMORY_INFO;
						num_events += 3; /* total, free, used */

						/* For Tesla and Quadro products from the Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_PERF_STATES;
								num_events++;
						}

						/* 	For "GF11x" Tesla and Quadro products from the Fermi family
							requires NVML_INFOROM_POWER 3.0 or higher
							For Tesla and Quadro products from the Kepler family
							does not require NVML_INFOROM_POWER */
						if ( isFermi ) {
								ret = nvmlDeviceGetPowerUsage( devices[i], &temp);
								if ( NVML_SUCCESS == ret ) {
										features[i] |= FEATURE_POWER;
										num_events++;
								}
						}

						/* For all discrete and S-class products. */
						features[i] |= FEATURE_TEMP;
						num_events++;

						/* For Tesla and Quadro products from the Fermi and Kepler families */
						if (isFermi) {
								features[i] |= FEATURE_UTILIZATION;
								num_events += 2;
						}

						/* remember the name so later identical cards reuse these bits */
						strncpy( names[i], name, 64); 

				}
		}
		return PAPI_OK;

}
Ejemplo n.º 12
0
static int
hwloc_nvml_discover(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  nvmlReturn_t ret;
  unsigned nb, i;

  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
    return 0;

  if (!hwloc_topology_is_thissystem(topology)) {
    hwloc_debug("%s", "\nno NVML detection (not thissystem)\n");
    return 0;
  }

  ret = nvmlInit();
  if (NVML_SUCCESS != ret)
    return 0;
  ret = nvmlDeviceGetCount(&nb);
  if (NVML_SUCCESS != ret || !nb) {
    nvmlShutdown();
    return 0;
  }

  for(i=0; i<nb; i++) {
    nvmlPciInfo_t pci;
    nvmlDevice_t device;
    hwloc_obj_t osdev, parent;
    char buffer[64];

    ret = nvmlDeviceGetHandleByIndex(i, &device);
    assert(ret == NVML_SUCCESS);

    osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
    snprintf(buffer, sizeof(buffer), "nvml%d", i);
    osdev->name = strdup(buffer);
    osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
    osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_GPU;

    hwloc_obj_add_info(osdev, "Backend", "NVML");
    hwloc_obj_add_info(osdev, "GPUVendor", "NVIDIA Corporation");

    buffer[0] = '\0';
    ret = nvmlDeviceGetName(device, buffer, sizeof(buffer));
    hwloc_obj_add_info(osdev, "GPUModel", buffer);

    /* these may fail with NVML_ERROR_NOT_SUPPORTED on old devices */
    buffer[0] = '\0';
    ret = nvmlDeviceGetSerial(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIASerial", buffer);

    buffer[0] = '\0';
    ret = nvmlDeviceGetUUID(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIAUUID", buffer);

    parent = NULL;
    if (NVML_SUCCESS == nvmlDeviceGetPciInfo(device, &pci)) {
      parent = hwloc_pci_belowroot_find_by_busid(topology, pci.domain, pci.bus, pci.device, 0);
      if (!parent)
	parent = hwloc_pci_find_busid_parent(topology, pci.domain, pci.bus, pci.device, 0);
#if HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION
      if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) {
	unsigned maxwidth = 0, maxgen = 0;
	float lanespeed;
	nvmlDeviceGetMaxPcieLinkWidth(device, &maxwidth);
	nvmlDeviceGetMaxPcieLinkGeneration(device, &maxgen);
	/* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding    = 0.25GB/s data-rate per lane
	 * PCIe Gen2 = 5  GT/s signal-rate per lane with 8/10 encoding    = 0.5 GB/s data-rate per lane
	 * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
	 */
	lanespeed = maxgen <= 2 ? 2.5 * maxgen * 0.8 : 8.0 * 128/130; /* Gbit/s per lane */
	if (lanespeed * maxwidth)
	  /* we found the max link speed, replace the current link speed found by pci (or none) */
	  parent->attr->pcidev.linkspeed = lanespeed * maxwidth / 8; /* GB/s */
      }
#endif
    }
    if (!parent)
      parent = hwloc_get_root_obj(topology);

    hwloc_insert_object_by_parent(topology, parent, osdev);
  }

  nvmlShutdown();
  return nb;
}
Ejemplo n.º 13
0
static int get_process_info(unsigned int*ncores,unsigned int *valarray)
{

    nvmlReturn_t ret;

    ret=nvmlInit();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }


    unsigned int c;

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    *ncores=c;
    /*
        if(c!=NDEV){
            fprintf(stderr,"ERROR:: Current number of Cores is [%d],not %d....YOU NEED RECOMPILE THIS ROUTINE\n",c,NDEV);
            return -2;
        }
    */
    nvmlDevice_t devs[NDEV];

    nvmlProcessInfo_t pis[MAXPROC];


    int i;
    for(i=0; i<c; i++) {

        ret=nvmlDeviceGetHandleByIndex(i,&devs[i]);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            return -1;
        }

        unsigned int np=MAXPROC;
        ret=nvmlDeviceGetComputeRunningProcesses(devs[i],&np,pis);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret));
            return -1;
        }
        valarray[i]=np;

    }

    ret=nvmlShutdown();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    return 0;


}
Ejemplo n.º 14
0
	/*
	 * One-time NVML set-up for the measurement thread.
	 *
	 * Initializes NVML, requires that at most one device is present, stores
	 * the handle of device 0 in mDevice and queries name, PCI info, power
	 * management mode, performance state, supported memory/graphics clocks
	 * and total memory. It then runs the gpu_management tool with "-p 1",
	 * logs everything it found and, depending on mGpuFrequency, either asks
	 * the tool to set the min/max clocks or just reports the current clocks.
	 * Finally it sleeps 15 seconds.
	 *
	 * Any failure is logged and terminates the process with EXIT_FAILURE.
	 */
	void CMeasureNVML<TSkipMs, TVariant>::init(void) {
		if(TVariant == VARIANT_FULL) {
			mrLog()
			<< ">>> 'nvml' (full version)" << std::endl;
		} else {
			mrLog()
			<< ">>> 'nvml' (light version)" << std::endl;
		}
		
		nvmlReturn_t result;
		int32_t rv;
		char const* args_set_pm[] = {"gpu_management", "-p 1", NULL};
		
		uint32_t device_count;
		char name[NVML_DEVICE_NAME_BUFFER_SIZE];
		nvmlPciInfo_t pci;
		nvmlEnableState_t mode;
		std::string modes[2] = {"disabled", "enabled"};
		std::stringstream clk_gpu_str;
		std::stringstream clk_mem_str;
		nvmlPstates_t power_state;
		nvmlMemory_t memory;
		
		// All clock tables share one compile-time capacity; the *_cnt
		// variables are in/out arguments that NVML overwrites with the
		// number of entries it actually filled in.
		const uint32_t count			= 32;
		uint32_t clk_mem_cnt			= count;
		uint32_t clk_mem[count];
		uint32_t clk_mem_max			= 0;
		uint32_t clk_mem_min			= 0xffffffff;
		uint32_t clk_mem_set			= 0;
		uint32_t clk_gpu_min_arr_cnt	= count;
		uint32_t clk_gpu_min_arr[count];
		uint32_t clk_gpu_min			= 0xffffffff;
		uint32_t clk_gpu_max_arr_cnt	= count;
		uint32_t clk_gpu_max_arr[count];
		uint32_t clk_gpu_max			= 0;
		uint32_t clk_gpu_set			= 0;
		uint32_t memory_total			= 0;
		
		result = nvmlInit();
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot initialize nvml library. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetCount(&device_count);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot query device count. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		if (device_count > 1) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: this software has be rewritten if you want to support more than 1 device. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << ">>> 'nvml' (thread main): get gpu device handler...";
		mrLog.flush();
		
		result = nvmlDeviceGetHandleByIndex(0, &mDevice);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device handler. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << " done!" << std::endl;
		
		result = nvmlDeviceGetName(mDevice, name, NVML_DEVICE_NAME_BUFFER_SIZE);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device name. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPciInfo(mDevice, &pci);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get pci information. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPowerManagementMode(mDevice, &mode);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no power managment supported. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPerformanceState(mDevice, &power_state);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetSupportedMemoryClocks(mDevice, &clk_mem_cnt, clk_mem);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain memory clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		// Lowest and highest supported memory clock.
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			clk_mem_min = (clk_mem[i]<clk_mem_min) ? clk_mem[i] : clk_mem_min;
			clk_mem_max = (clk_mem[i]>clk_mem_max) ? clk_mem[i] : clk_mem_max;
		}
		
		// Graphics clocks that can be paired with the lowest memory clock.
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_min, &clk_gpu_min_arr_cnt, clk_gpu_min_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			clk_gpu_min = (clk_gpu_min_arr[i]<clk_gpu_min) ? clk_gpu_min_arr[i] : clk_gpu_min;
		}
		
		// Graphics clocks that can be paired with the highest memory clock.
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_max, &clk_gpu_max_arr_cnt, clk_gpu_max_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			clk_gpu_max = (clk_gpu_max_arr[i]>clk_gpu_max) ? clk_gpu_max_arr[i] : clk_gpu_max;
		}
		
		result = nvmlDeviceGetMemoryInfo(mDevice, &memory);
		if (NVML_SUCCESS != result) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		// bytes -> MiB
		memory_total = (uint32_t)(memory.total >> 20);
		
		rv = exec_gpu_mgmt((char**)args_set_pm);
		if (rv) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		mrLog()
		<< ">>> 'nvml' (thread main): persistence mode enabled." << std::endl;
		
		mrLog()
		<< ">>> 'nvml' (thread main):" << std::endl
		<< "     device         : " << name << std::endl
		<< "     pcie           : " << pci.busId << std::endl
		<< "     power mgmt mode: " << modes[mode] << std::endl
		<< "     power state cur: " << power_state << std::endl
		<< "     power state min: " << NVML_PSTATE_15 << std::endl
		<< "     power state max: " << NVML_PSTATE_0 << std::endl
		<< "     memory total   : " << memory_total << " MiB" << std::endl
		<< "     avail mem clks : ";
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			if (i<(int32_t)clk_mem_cnt-1) {
				mrLog() << clk_mem[i] << " MHz, ";
			} else {
				mrLog() << clk_mem[i] << " MHz" << std::endl;
			}
		}
		
		mrLog()
		<< "     memory clk min : " << clk_mem_min << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_min_arr_cnt-1) {
				mrLog() << clk_gpu_min_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_min_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk min   : " << clk_gpu_min << " MHz" << std::endl;
		
		mrLog()
		<< "     memory clk max : " << clk_mem_max << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_max_arr_cnt-1) {
				mrLog() << clk_gpu_max_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_max_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk max   : " << clk_gpu_max << " MHz" << std::endl;
		
		switch (mGpuFrequency) {
			case GPU_FREQUENCY_MIN:
				clk_mem_set = clk_mem_min;
				clk_gpu_set = clk_gpu_min;
				break;
			case GPU_FREQUENCY_MAX:
				clk_mem_set = clk_mem_max;
				clk_gpu_set = clk_gpu_max;
				break;
			case GPU_FREQUENCY_CUR:
			default:
				clk_mem_set = 0;
				clk_gpu_set = 0;
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &clk_mem_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_GRAPHICS, &clk_gpu_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				break;
		}
		
		if (mGpuFrequency == GPU_FREQUENCY_MIN || mGpuFrequency == GPU_FREQUENCY_MAX) {
			// In these cases we actually set the GPU frequencies either to the maximum or minimum value.
			clk_gpu_str << "-c " << clk_gpu_set;
			clk_mem_str << "-m " << clk_mem_set;
			// str() returns a temporary; keep named copies alive so the
			// c_str() pointers in the argument vector stay valid for the
			// whole exec_gpu_mgmt() call.
			const std::string clk_gpu_arg = clk_gpu_str.str();
			const std::string clk_mem_arg = clk_mem_str.str();
			char const* args_set_clk[] = {"gpu_management", clk_gpu_arg.c_str(), clk_mem_arg.c_str(), NULL};
			rv = exec_gpu_mgmt((char**)args_set_clk);
			if (rv) {
				mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
				exit(EXIT_FAILURE);
			}
						
			mrLog()
			<< ">>> 'nvml' (thread main): set core clk to " << clk_gpu_set << " MHz and mem clk to " << clk_mem_set << " MHz." << std::endl;
		} else {
			// We name the values *_set, but we don't set the frequency. We just print the current GPU frequency.
			mrLog()
			<< ">>> 'nvml' (thread main): current core clk is " << clk_gpu_set << " MHz and mem clk is " << clk_mem_set << " MHz." << std::endl;
		}
		mrLog()
		<< ">>> 'nvml' (thread main): wait for 15s to throttle gpu clocks." << std::endl;
		sleep(15);
		
		mrLog()
		<< ">>> 'nvml' (thread main): initialization done." << std::endl
		<< std::endl;
	}
Ejemplo n.º 15
0
/*
 * Discover the NVIDIA GPUs NVML reports and register each one that hwloc can
 * place under the machine object.
 *
 * For every device index the NVML handle is resolved, mapped to an hwloc
 * object (falling back to get_non_nvml_device), wrapped in a PCI_Device and
 * handed to store_device_on_appropriate_chip. Devices that cannot be
 * resolved are logged and skipped.
 *
 * Always returns PBSE_NONE; NVML errors are only logged.
 */
int Machine::initializeNVIDIADevices(hwloc_obj_t machine_obj, hwloc_topology_t topology)
  {
  nvmlReturn_t rc;

  /* Initialize the NVML handle. 
   *
   * nvmlInit should be called once before invoking any other methods in the NVML library. 
   * A reference count of the number of initializations is maintained. Shutdown only occurs 
   * when the reference count reaches zero.
   * */
  rc = nvmlInit();
  if (rc != NVML_SUCCESS && rc != NVML_ERROR_ALREADY_INITIALIZED)
    {
    log_nvml_error(rc, NULL, __func__);
    return(PBSE_NONE);
    }

  unsigned int device_count = 0;

  /* Get the device count. */
  rc = nvmlDeviceGetCount(&device_count);
  if (rc == NVML_SUCCESS)
    {
    nvmlDevice_t gpu;

    /* Get the nvml device handle at each index */
    for (unsigned int idx = 0; idx < device_count; idx++)
      {
      rc = nvmlDeviceGetHandleByIndex(idx, &gpu);

      if (rc != NVML_SUCCESS)
        {
        /* TODO: get gpuid from nvmlDevice_t struct */
        log_nvml_error(rc, NULL, __func__);

        /* gpu is unset (or still the previous device's handle); using it
         * below would register the wrong device or read garbage */
        continue;
        }

      /* Use the hwloc library to determine device locality */
      hwloc_obj_t gpu_obj;
      hwloc_obj_t ancestor_obj;
  
      gpu_obj = hwloc_nvml_get_device_osdev(topology, gpu);
      if (gpu_obj == NULL)
        {
        /* This was not an nvml device. We will look for a "card" device (GeForce or Quadra) */
        gpu_obj = this->get_non_nvml_device(topology, gpu);
        if (gpu_obj == NULL)
          continue;
        }
        
      /* Only devices that sit below the machine object are registered */
      ancestor_obj = hwloc_get_ancestor_obj_by_type(topology, HWLOC_OBJ_MACHINE, gpu_obj);
      if (ancestor_obj != NULL)
        {
        PCI_Device new_device;
  
        new_device.initializePCIDevice(gpu_obj, idx, topology);

        store_device_on_appropriate_chip(new_device);
        }
      }
    }
  else
    {
    log_nvml_error(rc, NULL, __func__);
    }

  /* Shutdown the NVML handle. 
   *
   * nvmlShutdown should be called after NVML work is done, once for each call to nvmlInit() 
   * A reference count of the number of initializations is maintained. Shutdown only occurs when 
   * the reference count reaches zero. For backwards compatibility, no error is reported if 
   * nvmlShutdown() is called more times than nvmlInit().
   * */
  rc = nvmlShutdown();
  if (rc != NVML_SUCCESS)
    {
    log_nvml_error(rc, NULL, __func__);
    }

  return(PBSE_NONE);
  }
Ejemplo n.º 16
0
/*
 * NVML sample: list every device NVML reports and, for each one, try to
 * switch its compute mode to NVML_COMPUTEMODE_PROHIBITED and then restore
 * the mode that was read beforehand.
 *
 * Returns 0 when every device was processed and 1 as soon as an NVML call
 * fails unexpectedly (including a failed nvmlInit()). In every case the
 * program waits for ENTER on stdin before returning.
 */
int main()
{
    nvmlReturn_t result;
    unsigned int device_count, i;
    int exit_code = 1; /* pessimistic; set to 0 only once the loop completes */

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));

        /* Initialization failed, so this path does not call nvmlShutdown(). */
        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
        goto Cleanup;
    }
    /* device_count and i are unsigned, hence %u in every format below. */
    printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");

    printf("Listing devices:\n");
    for (i = 0; i < device_count; i++)
    {
        nvmlDevice_t device;
        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
        nvmlPciInfo_t pci;
        nvmlComputeMode_t compute_mode;

        // Query for device handle to perform operations on a device
        // You can also query device handle by other features like:
        // nvmlDeviceGetHandleBySerial
        // nvmlDeviceGetHandleByPciBusId
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        // pci.busId is very useful to know which device physically you're talking to
        // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
        result = nvmlDeviceGetPciInfo(device, &pci);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        printf("%u. %s [%s]\n", i, name, pci.busId);

        // This is a simple example on how you can modify GPU's state
        result = nvmlDeviceGetComputeMode(device, &compute_mode);
        if (NVML_ERROR_NOT_SUPPORTED == result)
        {
            /* Not fatal: report it and move on to the next device. */
            printf("\t This is not CUDA capable device\n");
            continue;
        }
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        // try to change compute mode
        printf("\t Changing device's compute mode from '%s' to '%s'\n",
                convertToComputeModeString(compute_mode),
                convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

        result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
        if (NVML_ERROR_NO_PERMISSION == result)
        {
            printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
        }
        else if (NVML_ERROR_NOT_SUPPORTED == result)
        {
            printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                   "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
        }
        else if (NVML_SUCCESS != result)
        {
            printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }
        else
        {
            /* The mode was changed; put back the value read earlier. */
            printf("\t Restoring device's compute mode back to '%s'\n",
                    convertToComputeModeString(compute_mode));
            result = nvmlDeviceSetComputeMode(device, compute_mode);
            if (NVML_SUCCESS != result)
            {
                printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result));
                goto Cleanup;
            }
        }
    }

    exit_code = 0;

Cleanup:
    /* Single exit path shared by success and failure after nvmlInit(). */
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    if (exit_code == 0)
        printf("All done.\n");

    printf("Press ENTER to continue...\n");
    getchar();
    return exit_code;
}
Ejemplo n.º 17
0
/*
 * Entry point of the mocu server.
 *
 * Steps, in order:
 *   1. install _end_server as the handler for SIGINT and SIGUSR1;
 *   2. call init_proc() / init_cons();
 *   3. when at least one command-line argument is given, fork, let the
 *      parent exit, start a new session and close stdin/stdout/stderr;
 *   4. initialise NVML and store one device handle and one flag record per
 *      device in the global `dem`;
 *   5. listen on the AF_UNIX stream socket "mocu_server" and dispatch every
 *      received proc_data message on its REQUEST field until end_server
 *      becomes TRUE or select() fails.
 *
 * Returns 0 when the loop ends; setup failures terminate via exit().
 */
int main(int argc,char* argv[]){

  /**Initialize signal**/

  signal(SIGINT ,_end_server);
  signal(SIGUSR1,_end_server);

  /**Initialize struct proc**/
  init_proc();
  init_cons();

  /**Process becomes dem**/
  pid_t process_id = 0;
  pid_t sid = 0;

  if(argc >= 2){

    process_id = fork();

    if(process_id < 0){
      printf("fork failed ..\n");
      exit(1);
    }

    if(process_id > 0){
      /* Parent: the child carries on as the server. */
      exit(0);
    }

    umask(0);

    sid = setsid();

    if(sid < 0){
      exit(1);
    }

    /* NOTE(review): descriptors 0-2 are closed and not reopened, so the
     * socket()/accept() calls below can be handed these numbers and the
     * later printf()/perror() calls would then write into a socket.
     * Consider redirecting them to /dev/null instead - confirm intent. */
    close(STDIN_FILENO);
    close(STDOUT_FILENO);
    close(STDERR_FILENO);

  }else{
    sid = getpid();
  }

  /**Setup the log file**/

  char log[32];

  /* pid_t is signed: convert explicitly for %u and bound the write. */
  snprintf(log,sizeof(log),"log.%u",(unsigned int)sid);

  //  fp = fopen(log,"w+");

  /**Start Initialize nvidia management library from Here!!**/

  nvmlReturn_t nres;
  int i;

  nres = nvmlInit();

  if(nres != NVML_SUCCESS){
    perror("Failed to initialize Nvidia Managerment Library...\n");
    exit(-1);
  }

  nres = nvmlDeviceGetCount(&dem.ndev);

  if(nres != NVML_SUCCESS){
    perror("Failed to get num of device...\n");
    exit(-1);
  }

  dem.devs = (nvmlDevice_t*)malloc(sizeof(nvmlDevice_t)*dem.ndev);
  dem.flags = (dflag*)malloc(sizeof(dflag)*dem.ndev);

  /* malloc(0) may legitimately return NULL, so only a non-empty request
   * that comes back NULL is treated as an allocation failure. */
  if(dem.ndev > 0 && (dem.devs == NULL || dem.flags == NULL)){
    perror("malloc() failed");
    exit(-1);
  }

  MAXPROC = dem.ndev * 4;

  for(i = 0 ; i < dem.ndev ; i ++){

    nres = nvmlDeviceGetHandleByIndex(i,&dem.devs[i]);

    if(nres != NVML_SUCCESS){
      perror("Failed to get device handle\n");
      exit(-1);
    }

    dem.flags[i].sd = -1;
    dem.flags[i].flag = 0;
    dem.flags[i].stayed = 0;
    dem.flags[i].reserved = 0;
  }

  dem.procCounter = 0;

  /**Setup the socket**/

  int rc,on = 1;
  int listen_sd,max_sd,new_sd;
  int desc_ready;

  struct sockaddr_un addr;
  fd_set master_set,working_set;

  listen_sd = socket(AF_UNIX,SOCK_STREAM,0);

  if(listen_sd < 0){
    perror("socket() failed\n");
    exit(-1);
  }

  rc = setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, (char*)&on,sizeof(on));

  if(rc < 0){
    perror("setsockopt() failed\n");
    exit(-1);
  }

  /* Remove a stale socket file left by a previous run so bind() can succeed. */
  unlink("mocu_server");

  memset(&addr,0,sizeof(addr));

  addr.sun_family = AF_UNIX;
  strcpy(addr.sun_path,"mocu_server");

  rc = bind(listen_sd,(struct sockaddr*)&addr,sizeof(addr));

  if(rc < 0){
    perror("bind() failed");
    close(listen_sd);
    exit(-1);
  }

  rc = listen(listen_sd,SOMAXCONN);

  if(rc < 0){
    perror("listen() failed");
    close(listen_sd);
    exit(-1);
  }

  FD_ZERO(&master_set);
  max_sd = listen_sd;
  FD_SET(listen_sd,&master_set);

  /**Entering main loop**/
  proc_data* receivedProc = (proc_data*)malloc(sizeof(proc_data));

  if(receivedProc == NULL){
    perror("malloc() failed");
    close(listen_sd);
    exit(-1);
  }

  mocu_check();

  do{

    /* select() modifies the set it is given, so work on a copy. */
    memcpy(&working_set,&master_set,sizeof(master_set));

    /* NULL timeout: block until at least one descriptor is readable. */
    rc = select(max_sd+1, &working_set, NULL, NULL, NULL);

    if(rc < 0){
      perror("select() failed\n");
      break;
    }

    /* Not expected with a NULL timeout; kept as a guard. */
    if(rc == 0){
      printf("select() time out. End program.\n");
      break;
    }

    desc_ready = rc;

    for(i = 0 ; i < max_sd+1 && desc_ready > 0 ; ++i){

      if(FD_ISSET(i,&working_set)){

        /* NOTE(review): the usual select() loop decrements here
         * (desc_ready -= 1). Assigning -1 ends the scan after the first
         * ready descriptor, so exactly one descriptor is serviced per
         * select() call and mocu_check() runs after each one. Behaviour
         * kept as is; remaining ready descriptors are reported again by
         * the next select(). Confirm whether this is intentional. */
        desc_ready = -1;

        if(i == listen_sd){

          new_sd = accept(listen_sd,NULL,NULL);

          if(new_sd < 0){
            printf("accept() failed");
            end_server = TRUE;
          }else{
            /* Only a valid descriptor may be added to an fd_set. */
            FD_SET(new_sd,&master_set);

            if(new_sd > max_sd){
              max_sd = new_sd;
            }
          }

        }else{

          rc = recv(i,receivedProc,sizeof(proc_data),0);

          if(rc <= 0){

            /* Peer closed the connection or recv() failed. */
            FD_CLR(i,&master_set);

            _FIN(i);

          }else{

            if(receivedProc->REQUEST == CONNECT){

              _CONNECT(i,receivedProc);

            }else if(receivedProc->REQUEST == RENEW){

              _RENEW(i,receivedProc);

            }else if(receivedProc->REQUEST == MIGDONE){

              _MIGDONE(i,receivedProc);

            }else if(receivedProc->REQUEST == CANRECEIVE){

              _CANRECEIVE(i,receivedProc);

            }else if(receivedProc->REQUEST == FAILEDTOALLOC){

              _FAILEDTOALLOC(i,receivedProc);

              exit(-1);//TEST

            }else if(receivedProc->REQUEST == MALLOCDONE){

              _MALLOCDONE(i,receivedProc);

            }else if(receivedProc->REQUEST == CUDAMALLOC){

              _CUDAMALLOC(i,receivedProc);

            }else if(receivedProc->REQUEST == BACKUPED){

              _BACKUPED(i,receivedProc);

            }else if(receivedProc->REQUEST == CONTEXT_CHECK){

              _CONTEXT_CHECK(i,receivedProc);

            }else if(receivedProc->REQUEST == CREATE_CONTEXT){

              _CREATE_CONTEXT(i);

            }else if(receivedProc->REQUEST == CONSOLE){

              _CONSOLE(i);

            }else{
              printf("Unkown request...\n");
              exit(-1);
            }
          }
        }
      }
    }

    mocu_check();

  }while(end_server == FALSE);

  /* Close every descriptor still registered, including max_sd itself. */
  for(i = 0 ; i <= max_sd ; i ++){
    if(FD_ISSET(i,&master_set)){
      close(i);
    }
  }

  free(receivedProc);

  //  fclose(fp);

  return 0;
}
Ejemplo n.º 18
0
  /**
   * Fill *nvml with a summary over all GPUs counted in sp->nvml.gpu_count.
   *
   * - gpu_time, mem_time and energy are copied from the accumulators held
   *   in sp->nvml.
   * - mem_total, mem_free, ecc_errors and processes are summed over the
   *   devices.
   * - temperature and fan_speed hold the maximum seen on any device.
   *
   * A per-device query that fails is skipped and contributes nothing.
   *
   * @param sp    agent state; supplies the GPU count and the accumulators.
   * @param nvml  output structure, overwritten by this call.
   * @return NO when there are no GPUs or a device handle cannot be
   *         obtained (in which case *nvml holds only the devices visited
   *         so far), YES otherwise.
   */
  int readNvmlCounters(HSP *sp, SFLHost_gpu_nvml *nvml) {
    unsigned int i;

    if(sp->nvml.gpu_count == 0) {
      return NO;
    }

    // pick up latest value of accumulators
    nvml->gpu_time = sp->nvml.nvml_gpu_time;
    nvml->mem_time = sp->nvml.nvml_mem_time;
    nvml->energy = sp->nvml.nvml_energy;

    // and fill in the rest of the counters/gauges too
    nvml->device_count = sp->nvml.gpu_count;

    // zero these, and sum across all GPUs
    nvml->mem_total = 0;
    nvml->mem_free = 0;
    nvml->ecc_errors = 0;
    nvml->processes = 0;

    // use the max across all GPUs
    nvml->temperature = 0;
    nvml->fan_speed = 0;

    for (i = 0; i < sp->nvml.gpu_count; ++i) {
      unsigned long long eccErrors;
      unsigned int temp;
      nvmlDevice_t gpu;
      unsigned int speed;
      // In/out for the process query below: on input it is the capacity of
      // the (NULL) info array, so it must be 0 to request a count only.
      unsigned int procs = 0;
      nvmlMemory_t memInfo;
      nvmlReturn_t result;

      if (NVML_SUCCESS != nvmlDeviceGetHandleByIndex(i, &gpu)) {
        return NO;
      }
      if (NVML_SUCCESS == nvmlDeviceGetMemoryInfo(gpu, &memInfo)) {
        nvml->mem_total += memInfo.total;
        nvml->mem_free  += memInfo.free;
      }
      // single-bit and double-bit volatile ECC errors go into one total
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_SINGLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTotalEccErrors(gpu, NVML_DOUBLE_BIT_ECC, NVML_VOLATILE_ECC, &eccErrors)) {
        nvml->ecc_errors += eccErrors;
      }
      if (NVML_SUCCESS == nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU, &temp)) {
        if (nvml->temperature < temp) {
          nvml->temperature = temp;
        }
      }
      if (NVML_SUCCESS == nvmlDeviceGetFanSpeed(gpu, &speed)) {
        if (nvml->fan_speed < speed) {
          nvml->fan_speed = speed;
        }
      }
      // With a zero-sized array NVML answers INSUFFICIENT_SIZE when
      // processes are running (procs then holds the count) and SUCCESS
      // when there are none.
      result = nvmlDeviceGetComputeRunningProcesses(gpu, &procs, NULL);
      if (NVML_SUCCESS == result || NVML_ERROR_INSUFFICIENT_SIZE == result) {
        nvml->processes += procs;
      }
    }

    return YES;
  }