Exemple #1
0
nvmlReturn_t NVML_DL(nvmlInit)(void)
{
    handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
    if (handle == NULL) {
	return (NVML_ERROR_LIBRARY_NOT_FOUND);
    }
    return (nvmlInit());
}
int probe_gpustats(devstat**stats)
{

    unsigned int n_dev;
    nvmlReturn_t nvret;


    nvret=nvmlInit();
    CHK_NVML(nvret,"Init NVML");


    nvret=nvmlDeviceGetCount(&n_dev);
    CHK_NVML(nvret,"getCount");


    *stats=(devstat*)calloc(n_dev,sizeof(devstat));
    devstat*pstats=*stats;


    int i;
    for(i=0;i<n_dev;i++)
        nvmlDeviceGetHandleByIndex(i,&pstats[i].handler);

    
    for(i=0;i<n_dev;i++)
        nvmlDeviceGetMemoryInfo(pstats[i].handler,&pstats[i].meminfo);
    
    for(i=0;i<n_dev;i++)
        nvmlDeviceGetUtilizationRates(pstats[i].handler,&pstats[i].utils);

    unsigned int sampp;
    for(i=0;i<n_dev;i++)
        nvmlDeviceGetEncoderUtilization(pstats[i].handler,&pstats[i].encutil,&sampp);

    for(i=0;i<n_dev;i++)
        nvmlDeviceGetDecoderUtilization(pstats[i].handler,&pstats[i].decutil,&sampp);
#if 0
    int maxfreeind=0;
    int maxfree=0;
    for(i=0;i<n_dev;i++){

        print_devstats(&pstats[i]);

        int free=pstats[i].meminfo.free; 
//        fprintf(stderr,"<%d\n",free);
        if(free>maxfree){
            maxfree=free;
            maxfreeind=i;
        }

    }
#endif
    nvret=nvmlShutdown();
    CHK_NVML(nvret,"Shutdown NVML");


    return n_dev;
}
Exemple #3
0
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray)
{

    nvmlReturn_t ret;
    ret=nvmlInit();


    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }


    unsigned int c;

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    *ncores=c;

    nvmlDevice_t devs[NDEV];
    nvmlMemory_t meminfo;


    int i;
    for(i=0; i<c; i++) {

        ret=nvmlDeviceGetHandleByIndex(i,&devs[i]);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            return -1;
        }

        ret=nvmlDeviceGetMemoryInfo(devs[i],&meminfo);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret));
            return -1;
        }
        usedarray[i]=meminfo.used;

    }

    ret=nvmlShutdown();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    return 0;

}
  /*_________________---------------------------__________________
    _________________     nvml_init             __________________
    -----------------___________________________------------------
    Called at startup
  */
  void nvml_init(HSP *sp) {
    unsigned int gpuCount;

    if (NVML_SUCCESS != nvmlInit()) {
      return;
    }
    if (NVML_SUCCESS != nvmlDeviceGetCount(&gpuCount)) {
      return;
    }
    sp->nvml.gpu_count = gpuCount;
  }
Exemple #5
0
// NVIDIA NVML library function wrapper for GPU DVFS.
int SetGPUFreq(unsigned int clock_mem, unsigned int clock_core) {
    nvmlDevice_t device;//int device;
    nvmlReturn_t result;
    result = nvmlInit();
    result = nvmlDeviceGetHandleByIndex(0, &device);//cudaGetDevice(&device);
    result = nvmlDeviceSetApplicationsClocks(device, clock_mem, clock_core);//(nvmlDevice_t)device
    if(result != NVML_SUCCESS)
    {
        printf("Failed to set GPU core and memory frequencies: %s\n", nvmlErrorString(result));
        return 1;
    }
    else
    {
        nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &clock_core);
        nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_MEM, &clock_mem);
        ////printf("GPU core frequency is now set to %d MHz; GPU memory frequency is now set to %d MHz", clock_core, clock_mem);
        return 0;
    }
}
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    initnvml
 * Signature: ()Ljava/lang/String;
 */
JNIEXPORT jstring JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_initnvml
  (JNIEnv *env, jobject)
{
    nvmlReturn_t result;
    unsigned int device_count, i;
	char sentence[200];
	std::string err = "";

    result = nvmlInit();
    if (NVML_SUCCESS != result) { 
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        sprintf(sentence, "Failed to initialize NVML: %s\n", nvmlErrorString(result));
		err.append( (std::string)sentence );
    }
	char name[NVML_DEVICE_NAME_BUFFER_SIZE];
	result = nvmlDeviceGetHandleByIndex(0, &device);
	if (NVML_SUCCESS != result) { 
		printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result));
		sprintf(sentence,"Failed to get handle for device %i: %s\n", i, nvmlErrorString(result));
		err.append( (std::string)sentence );
		result = nvmlShutdown();
		return 0;
	}
	result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
	if (NVML_SUCCESS != result) { 
		printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result));
		sprintf(sentence,"Failed to get name of device %i: %s\n", i, nvmlErrorString(result));
		err.append( (std::string)sentence );
		result = nvmlShutdown();
		return 0;
	}
	printf("Device : %s\n",name);
	sprintf(sentence,"Device : %s\n",name);
	err.append( (std::string)sentence );
	return env->NewStringUTF( err.c_str() );
}
Exemple #7
0
int main(int argc,char* argv[]){

  /**Initialize signal**/

  signal(SIGINT ,_end_server);
  signal(SIGUSR1,_end_server);

  /**Initialize struct proc**/
  init_proc();
  init_cons();

  /**Process becomes dem**/
  pid_t process_id = 0;
  pid_t sid = 0;

  if(argc >= 2){

    process_id = fork();

    if(process_id < 0){
      printf("fork failed ..\n");
      exit(1);
    }

    if(process_id > 0){
      exit(0);
    }

    umask(0);

    sid = setsid();

    if(sid < 0){
      exit(1);
    }

    close(STDIN_FILENO);
    close(STDOUT_FILENO);
    close(STDERR_FILENO);

  }else{
    sid = getpid();
  }

  /**Setup the log file**/

  char log[32];

  sprintf(log,"log.%u",sid);

  //  fp = fopen(log,"w+");

  /**Start Initialize nvidia management library from Here!!**/

  nvmlReturn_t nres;
  int i;
  
  nres = nvmlInit();

  if(nres != NVML_SUCCESS){
    perror("Failed to initialize Nvidia Managerment Library...\n");
    exit(-1);
  }

  nres = nvmlDeviceGetCount(&dem.ndev);

  if(nres != NVML_SUCCESS){
    perror("Failed to get num of device...\n");
    exit(-1);
  }

  dem.devs = (nvmlDevice_t*)malloc(sizeof(nvmlDevice_t)*dem.ndev);
  dem.flags = (dflag*)malloc(sizeof(dflag)*dem.ndev);

  MAXPROC = dem.ndev * 4;

  for(i = 0 ; i < dem.ndev ; i ++){

    nres = nvmlDeviceGetHandleByIndex(i,&dem.devs[i]);

    if(nres != NVML_SUCCESS){
      perror("Failed to get device handle\n");
      exit(-1);
    }

    dem.flags[i].sd = -1;
    dem.flags[i].flag = 0;
    dem.flags[i].stayed = 0;
    dem.flags[i].reserved = 0;
  }

  dem.procCounter = 0;

  /**Setup the socket**/

  int len,rc,on = 1;
  int listen_sd,max_sd,new_sd;
  int desc_ready;
  int close_conn;

  struct sockaddr_un addr;
  struct timeval timeout;
  fd_set master_set,working_set;

  listen_sd = socket(AF_UNIX,SOCK_STREAM,0);

  if(listen_sd < 0){
    perror("socket() failed\n");
    exit(-1);
  }

  rc = setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, (char*)&on,sizeof(on));

  if(rc < 0){
    perror("setsockopt() failed\n");
    exit(-1);
  }

  unlink("mocu_server");

  memset(&addr,0,sizeof(addr));

  addr.sun_family = AF_UNIX;
  strcpy(addr.sun_path,"mocu_server");

  rc = bind(listen_sd,(struct sockaddr*)&addr,sizeof(addr));

  if(rc < 0){
    perror("bind() failed");
    close(listen_sd);
    exit(-1);
  }

  rc = listen(listen_sd,SOMAXCONN);

  if(rc < 0){
    perror("listen() failed");
    close(listen_sd);
    exit(-1);
  }

  FD_ZERO(&master_set);
  max_sd = listen_sd;
  FD_SET(listen_sd,&master_set);

  timeout.tv_sec = 3*60;
  timeout.tv_usec = 0;

  long counter = 0;

  /**Entering main loop**/
  proc_data* receivedProc = (proc_data*)malloc(sizeof(proc_data));

  mocu_check();

  do{

    memcpy(&working_set,&master_set,sizeof(master_set));

    rc = select(max_sd+1, &working_set, NULL, NULL, NULL);

    if(rc < 0){
      perror("select() failed\n");
      break;
    }

    if(rc == 0){
      printf("select() time out. End program.\n");
      break;
    }

    desc_ready = rc;

    for(i = 0 ; i < max_sd+1 && desc_ready > 0 ; ++i){
      
      if(FD_ISSET(i,&working_set)){

	desc_ready = -1;

	if(i == listen_sd){

	  new_sd = accept(listen_sd,NULL,NULL);

	  if(new_sd < 0){
	    printf("accept() failed");
	    end_server = TRUE;
	  }

	  FD_SET(new_sd,&master_set);

	  if(new_sd > max_sd){
	    max_sd = new_sd;
	  }

	}else{

	  rc = recv(i,receivedProc,sizeof(proc_data),0);

	  if(rc <= 0){

	    FD_CLR(i,&master_set);

	    _FIN(i);

	  }else{

	    if(receivedProc->REQUEST == CONNECT){

	      _CONNECT(i,receivedProc);

	    }else if(receivedProc->REQUEST == RENEW){

	      _RENEW(i,receivedProc);
	      
	    }else if(receivedProc->REQUEST == MIGDONE){

	      _MIGDONE(i,receivedProc);
	      
	    }else if(receivedProc->REQUEST == CANRECEIVE){

	      _CANRECEIVE(i,receivedProc);
	      
	    }else if(receivedProc->REQUEST == FAILEDTOALLOC){

	      _FAILEDTOALLOC(i,receivedProc);

	      exit(-1);//TEST
	      
	    }else if(receivedProc->REQUEST == MALLOCDONE){

	      _MALLOCDONE(i,receivedProc);

	    }else if(receivedProc->REQUEST == CUDAMALLOC){

	      _CUDAMALLOC(i,receivedProc);

	    }else if(receivedProc->REQUEST == BACKUPED){

	      _BACKUPED(i,receivedProc);
	      
	    }else if(receivedProc->REQUEST == CONTEXT_CHECK){

	      _CONTEXT_CHECK(i,receivedProc);
	      
	    }else if(receivedProc->REQUEST == CREATE_CONTEXT){

	      _CREATE_CONTEXT(i);

	    }else if(receivedProc->REQUEST == CONSOLE){

	      _CONSOLE(i);
	      
	    }else{
	      printf("Unkown request...\n");
	      exit(-1);
	    }
	  }
	}
      }
    }

    mocu_check();

  }while(end_server == FALSE);

  int closed = 0;

  for(i = 0 ; i < max_sd ; i ++){
    if(FD_ISSET(i,&master_set)){
      close(i);
      closed = 1;
    }
  }

  //  fclose(fp);

  return 0;
}
Exemple #8
0
/** Initialize hardware counters, setup the function vector table
 * and get hardware information, this routine is called when the
 * PAPI process is initialized (IE PAPI_library_init)
 */
		int
_papi_nvml_init_substrate( int cidx )
{
		nvmlReturn_t ret;
		cudaError_t cuerr;

		int cuda_count = 0;
		unsigned int nvml_count = 0;

		ret = nvmlInit();
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize.");
				goto disable;
		}

		cuerr = cuInit( 0 );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize.");
				goto disable;
		}

		/* Figure out the number of CUDA devices in the system */
		ret = nvmlDeviceGetCount( &nvml_count );
		if ( NVML_SUCCESS != ret ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library.");
				goto disable;
		}

		cuerr = cudaGetDeviceCount( &cuda_count );
		if ( CUDA_SUCCESS != cuerr ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA.");
				goto disable;
		}

		/* We can probably recover from this, when we're clever */
		if ( nvml_count != cuda_count ) {
				strcpy(_nvml_vector.cmp_info.disabled_reason, "Cuda and the NVIDIA managament library have different device counts.");
				goto disable;
		}

		device_count = cuda_count;

		/* A per device representation of what events are present */
		features = (int*)papi_malloc(sizeof(int) * device_count );

		/* Handles to each device */
		devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);

		/* Figure out what events are supported on each card. */
		if ( (papi_errorcode = detectDevices( ) ) != PAPI_OK ) {
			papi_free(features);
			papi_free(devices);
			sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install." );
			goto disable;
		}

		/* The assumption is that if everything went swimmingly in detectDevices, 
			all nvml calls here should be fine. */
		createNativeEvents( );

		/* Export the total number of events available */
		_nvml_vector.cmp_info.num_native_events = num_events;

		/* Export the component id */
		_nvml_vector.cmp_info.CmpIdx = cidx;

		/* Export the number of 'counters' */
		_nvml_vector.cmp_info.num_cntrs = num_events;

		return PAPI_OK;

disable:
		_nvml_vector.cmp_info.num_cntrs = 0;
		return PAPI_OK;	
}
Exemple #9
0
static int
hwloc_nvml_discover(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  nvmlReturn_t ret;
  unsigned nb, i;

  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
    return 0;

  if (!hwloc_topology_is_thissystem(topology)) {
    hwloc_debug("%s", "\nno NVML detection (not thissystem)\n");
    return 0;
  }

  ret = nvmlInit();
  if (NVML_SUCCESS != ret)
    return 0;
  ret = nvmlDeviceGetCount(&nb);
  if (NVML_SUCCESS != ret || !nb) {
    nvmlShutdown();
    return 0;
  }

  for(i=0; i<nb; i++) {
    nvmlPciInfo_t pci;
    nvmlDevice_t device;
    hwloc_obj_t osdev, parent;
    char buffer[64];

    ret = nvmlDeviceGetHandleByIndex(i, &device);
    assert(ret == NVML_SUCCESS);

    osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
    snprintf(buffer, sizeof(buffer), "nvml%d", i);
    osdev->name = strdup(buffer);
    osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
    osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_GPU;

    hwloc_obj_add_info(osdev, "Backend", "NVML");
    hwloc_obj_add_info(osdev, "GPUVendor", "NVIDIA Corporation");

    buffer[0] = '\0';
    ret = nvmlDeviceGetName(device, buffer, sizeof(buffer));
    hwloc_obj_add_info(osdev, "GPUModel", buffer);

    /* these may fail with NVML_ERROR_NOT_SUPPORTED on old devices */
    buffer[0] = '\0';
    ret = nvmlDeviceGetSerial(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIASerial", buffer);

    buffer[0] = '\0';
    ret = nvmlDeviceGetUUID(device, buffer, sizeof(buffer));
    if (buffer[0] != '\0')
      hwloc_obj_add_info(osdev, "NVIDIAUUID", buffer);

    parent = NULL;
    if (NVML_SUCCESS == nvmlDeviceGetPciInfo(device, &pci)) {
      parent = hwloc_pci_belowroot_find_by_busid(topology, pci.domain, pci.bus, pci.device, 0);
      if (!parent)
	parent = hwloc_pci_find_busid_parent(topology, pci.domain, pci.bus, pci.device, 0);
#if HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION
      if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) {
	unsigned maxwidth = 0, maxgen = 0;
	float lanespeed;
	nvmlDeviceGetMaxPcieLinkWidth(device, &maxwidth);
	nvmlDeviceGetMaxPcieLinkGeneration(device, &maxgen);
	/* PCIe Gen1 = 2.5GT/s signal-rate per lane with 8/10 encoding    = 0.25GB/s data-rate per lane
	 * PCIe Gen2 = 5  GT/s signal-rate per lane with 8/10 encoding    = 0.5 GB/s data-rate per lane
	 * PCIe Gen3 = 8  GT/s signal-rate per lane with 128/130 encoding = 1   GB/s data-rate per lane
	 */
	lanespeed = maxgen <= 2 ? 2.5 * maxgen * 0.8 : 8.0 * 128/130; /* Gbit/s per lane */
	if (lanespeed * maxwidth)
	  /* we found the max link speed, replace the current link speed found by pci (or none) */
	  parent->attr->pcidev.linkspeed = lanespeed * maxwidth / 8; /* GB/s */
      }
#endif
    }
    if (!parent)
      parent = hwloc_get_root_obj(topology);

    hwloc_insert_object_by_parent(topology, parent, osdev);
  }

  nvmlShutdown();
  return nb;
}
int main() {
	nvml_void nvmlInit = NULL;
	nvml_void nvmlShutdown = NULL;

	// The actual filtering is done by the startd on the basis
	// of the SlotMergeConstraint we set for each ad we emit.
#if defined(WINDOWS)
	putenv( "CUDA_VISIBLE_DEVICES=" );
#else
	unsetenv( "CUDA_VISIBLE_DEVICES" );
#endif

	void * nvml_handle = NULL;
	const char * nvml_library = "libnvidia-ml.so";
	nvml_handle = dlopen( nvml_library, RTLD_LAZY );
	if(! nvml_handle) {
		fprintf( stderr, "Unable to load %s, aborting.\n", nvml_library );
		fail();
	}
	dlerror();

	nvmlInit                   = (nvml_void)dlsym( nvml_handle, "nvmlInit" );
	nvmlShutdown               = (nvml_void)dlsym( nvml_handle, "nvmlShutdown" );
	nvmlDeviceGetCount         = (nvml_unsigned_int)dlsym( nvml_handle, "nvmlDeviceGetCount" );
	nvmlDeviceGetSamples       = (nvml_dgs)dlsym( nvml_handle, "nvmlDeviceGetSamples" );
	nvmlDeviceGetMemoryInfo    = (nvml_dm)dlsym( nvml_handle, "nvmlDeviceGetMemoryInfo" );
	nvmlDeviceGetHandleByIndex = (nvml_dghbi)dlsym( nvml_handle, "nvmlDeviceGetHandleByIndex" );
	nvmlErrorString            = (cc_nvml)dlsym( nvml_handle, "nvmlErrorString" );

	nvmlReturn_t r = nvmlInit();
	if( r != NVML_SUCCESS ) {
		fprintf( stderr, "nvmlInit() failed, aborting.\n" );
		fail();
	}

	unsigned int deviceCount;
	r = nvmlDeviceGetCount( &deviceCount );
	if( r != NVML_SUCCESS ) {
		fprintf( stderr, "nvmlDeviceGetCount() failed, aborting.\n" );
		fail();
	}
	if( deviceCount <= 0 ) {
		fprintf( stderr, "Found 0 or fewer devices, aborting.\n" );
		fail();
	}

	nvmlDevice_t devices[deviceCount];
	unsigned maxSampleCounts[deviceCount];
	unsigned long long memoryUsage[deviceCount];
	unsigned long long lastSamples[deviceCount];
	unsigned long long firstSamples[deviceCount];
	unsigned long long elapsedTimes[deviceCount];
	for( unsigned i = 0; i < deviceCount; ++i ) {
		r = nvmlDeviceGetHandleByIndex( i, &(devices[i]) );
		if( r != NVML_SUCCESS ) {
			fprintf( stderr, "nvmlGetDeviceHandleByIndex(%u) failed (%d: %s), aborting.\n", i, r, nvmlErrorString( r ) );
			fail();
		}

		lastSamples[i] = 0;
		firstSamples[i] = 0;
		elapsedTimes[i] = 0;

		nvmlValueType_t sampleValueType;
		r = nvmlDeviceGetSamples( devices[i], NVML_GPU_UTILIZATION_SAMPLES, 0, & sampleValueType, & maxSampleCounts[i], NULL );
		if( r != NVML_SUCCESS ) {
			fprintf( stderr, "nvmlDeviceGetSamples(%u) failed while querying for the max sample count (%d: %s), aborting.\n", i, r, nvmlErrorString( r ) );
			fail();
		}
		if( sampleValueType != NVML_VALUE_TYPE_UNSIGNED_INT ) {
			fprintf( stderr, "nvmlDeviceGetSamples(%u) returned an unexpected type (%d) of sample when querying for the max sample count, aborting.\n", i, sampleValueType );
			fail();
		}

		memoryUsage[i] = 0;
	}

	// We deliberately ignore the first set of samples.  Partly, I think we
	// can claim that they're from before we started monitoring.  More
	// importantly, at least on a Tesla K40c (driver 384.81), initializing
	// the NVML library causes a one-second 99% usage spike on an other-
	// wise idle GPU.  So we'll ignore as much of that as we easily can.
	for( unsigned i = 0; i < deviceCount; ++i ) {
		getElapsedTimeForDevice( devices[i], &lastSamples[i], &elapsedTimes[i], maxSampleCounts[i] );
		firstSamples[i] = lastSamples[i];
		elapsedTimes[i] = 0;
	}

	time_t lastReport = time( NULL );
	while( 1 ) {
		usleep( 100000 );

		for( unsigned i = 0; i < deviceCount; ++i ) {
			r = getElapsedTimeForDevice( devices[i], &lastSamples[i], &elapsedTimes[i], maxSampleCounts[i] );
			if( r != NVML_SUCCESS ) {
				fprintf( stderr, "getElapsedTimeForDevice(%u) failed (%d: %s), aborting.\n", i, r, nvmlErrorString( r ) );
				fail();
			}

			if( debug ) {
				fprintf( stdout, "device %u: %llu / %llu = %u%%.\n",
					i, elapsedTimes[i], lastSamples[i] - firstSamples[i],
					(unsigned)(((double)elapsedTimes[i]) / (lastSamples[i] - firstSamples[i]) * 100)
				);
			}

			nvmlMemory_t mi = { 0, 0, 0 };
			r = nvmlDeviceGetMemoryInfo( devices[i], &mi );
			if( r != NVML_SUCCESS ) {
				fprintf( stderr, "getDeviceGetMemoryInfo(%u) failed (%d: %s), aborting.\n", i, r, nvmlErrorString( r ) );
				fail();
			}
			if( mi.used > memoryUsage[i] ) {
				memoryUsage[i] = mi.used;
			}
		}

		if( time( NULL ) - lastReport >= reportInterval ) {
			for( unsigned i = 0; i < deviceCount; ++i ) {
				fprintf( stdout, "SlotMergeConstraint = StringListMember( \"CUDA%u\", AssignedGPUs )\n", i );
				fprintf( stdout, "UptimeGPUsSeconds = %.6f\n", elapsedTimes[i] / 1000000.0 );
				fprintf( stdout, "UptimeGPUsMemoryPeakUsage = %llu\n", (memoryUsage[i] + (1024 * 1024) -1) / (1024 * 1024) );
				fprintf( stdout, "- GPUsSlot%u\n", i );
				fflush( stdout );

				// Report only the usage for each reporting period.
				elapsedTimes[i] = 0;
				firstSamples[i] = lastSamples[i];
				memoryUsage[i] = 0;
			}
			lastReport = time( NULL );
		}
	}

	r = nvmlShutdown();
	if( r != NVML_SUCCESS ) {
		fprintf( stderr, "nvmlShutdown() failed (%d: %s), aborting.\n", r, nvmlErrorString( r ) );
		return 1;
	}

	dlclose( nvml_handle );
	return 0;
}
Exemple #11
0
static int get_process_info(unsigned int*ncores,unsigned int *valarray)
{

    nvmlReturn_t ret;

    ret=nvmlInit();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }


    unsigned int c;

    ret=nvmlDeviceGetCount(&c);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    *ncores=c;
    /*
        if(c!=NDEV){
            fprintf(stderr,"ERROR:: Current number of Cores is [%d],not %d....YOU NEED RECOMPILE THIS ROUTINE\n",c,NDEV);
            return -2;
        }
    */
    nvmlDevice_t devs[NDEV];

    nvmlProcessInfo_t pis[MAXPROC];


    int i;
    for(i=0; i<c; i++) {

        ret=nvmlDeviceGetHandleByIndex(i,&devs[i]);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            return -1;
        }

        unsigned int np=MAXPROC;
        ret=nvmlDeviceGetComputeRunningProcesses(devs[i],&np,pis);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret));
            return -1;
        }
        valarray[i]=np;

    }

    ret=nvmlShutdown();

    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    return 0;


}
Exemple #12
0
	void CMeasureNVML<TSkipMs, TVariant>::init(void) {
		if(TVariant == VARIANT_FULL) {
			mrLog()
			<< ">>> 'nvml' (full version)" << std::endl;
		} else {
			mrLog()
			<< ">>> 'nvml' (light version)" << std::endl;
		}
		
		nvmlReturn_t result;
		int32_t rv;
		char const* args_set_pm[] = {"gpu_management", "-p 1", NULL};
		
		uint32_t device_count;
		char name[NVML_DEVICE_NAME_BUFFER_SIZE];
		nvmlPciInfo_t pci;
		nvmlEnableState_t mode;
		std::string modes[2] = {"disabled", "enabled"};
		std::stringstream clk_gpu_str;
		std::stringstream clk_mem_str;
		nvmlPstates_t power_state;
		nvmlMemory_t memory;
		
		const uint32_t count			= 32;
		uint32_t clk_mem_cnt			= count;
		uint32_t clk_mem[count];
		uint32_t clk_mem_max			= 0;
		uint32_t clk_mem_min			= 0xffffffff;
		uint32_t clk_mem_set			= 0;
		uint32_t clk_gpu_min_arr_cnt	= count;
		uint32_t clk_gpu_min_arr[clk_gpu_min_arr_cnt];
		uint32_t clk_gpu_min			= 0xffffffff;
		uint32_t clk_gpu_max_arr_cnt	= count;
		uint32_t clk_gpu_max_arr[clk_gpu_max_arr_cnt];
		uint32_t clk_gpu_max			= 0;
		uint32_t clk_gpu_set			= 0;
		uint32_t memory_total			= 0;
		
		result = nvmlInit();
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot initialize nvml library. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetCount(&device_count);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot query device count. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		if (device_count > 1) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: this software has be rewritten if you want to support more than 1 device. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << ">>> 'nvml' (thread main): get gpu device handler...";
		mrLog.flush();
		
		result = nvmlDeviceGetHandleByIndex(0, &mDevice);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device handler. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		mrLog() << " done!" << std::endl;
		
		result = nvmlDeviceGetName(mDevice, name, NVML_DEVICE_NAME_BUFFER_SIZE);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get device name. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPciInfo(mDevice, &pci);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot get pci information. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPowerManagementMode(mDevice, &mode);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no power managment supported. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetPerformanceState(mDevice, &power_state);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: no performance state reading possible. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		result = nvmlDeviceGetSupportedMemoryClocks(mDevice, &clk_mem_cnt, clk_mem);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain memory clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			clk_mem_min = (clk_mem[i]<clk_mem_min) ? clk_mem[i] : clk_mem_min;
			clk_mem_max = (clk_mem[i]>clk_mem_max) ? clk_mem[i] : clk_mem_max;
		}
		
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_min, &clk_gpu_min_arr_cnt, clk_gpu_min_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			clk_gpu_min = (clk_gpu_min_arr[i]<clk_gpu_min) ? clk_gpu_min_arr[i] : clk_gpu_min;
		}
		
		result = nvmlDeviceGetSupportedGraphicsClocks(mDevice, clk_mem_max, &clk_gpu_max_arr_cnt, clk_gpu_max_arr);
		if (NVML_SUCCESS != result) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: cannot obtain graphics clock. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			clk_gpu_max = (clk_gpu_max_arr[i]>clk_gpu_max) ? clk_gpu_max_arr[i] : clk_gpu_max;
		}
		
		result = nvmlDeviceGetMemoryInfo(mDevice, &memory);
		if (NVML_SUCCESS != result) {
			mrLog.lock();
			mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot obtain memory informations. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			mrLog.unlock();
			exit(EXIT_FAILURE);
		}
		memory_total = (uint32_t)(memory.total >> 20);
		
		rv = exec_gpu_mgmt((char**)args_set_pm);
		if (rv) {
			mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
			exit(EXIT_FAILURE);
		}
		mrLog()
		<< ">>> 'nvml' (thread main): persistence mode enabled." << std::endl;
		
		mrLog()
		<< ">>> 'nvml' (thread main):" << std::endl
		<< "     device         : " << name << std::endl
		<< "     pcie           : " << pci.busId << std::endl
		<< "     power mgmt mode: " << modes[mode] << std::endl
		<< "     power state cur: " << power_state << std::endl
		<< "     power state min: " << NVML_PSTATE_15 << std::endl
		<< "     power state max: " << NVML_PSTATE_0 << std::endl
		<< "     memory total   : " << memory_total << " MiB" << std::endl
		<< "     avail mem clks : ";
		for (int i=0; i<(int32_t)clk_mem_cnt; ++i) {
			if (i<(int32_t)clk_mem_cnt-1) {
				mrLog() << clk_mem[i] << " MHz, ";
			} else {
				mrLog() << clk_mem[i] << " MHz" << std::endl;
			}
		}
		
		mrLog()
		<< "     memory clk min : " << clk_mem_min << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_min_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_min_arr_cnt-1) {
				mrLog() << clk_gpu_min_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_min_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk min   : " << clk_gpu_min << " MHz" << std::endl;
		
		mrLog()
		<< "     memory clk max : " << clk_mem_max << " MHz" << std::endl
		<< "     avail core clks: ";
		for (int32_t i=0; i<(int32_t)clk_gpu_max_arr_cnt; ++i) {
			if (i<(int32_t)clk_gpu_max_arr_cnt-1) {
				mrLog() << clk_gpu_max_arr[i] << " MHz, ";
			} else {
				mrLog() << clk_gpu_max_arr[i] << " MHz" << std::endl;
			}
		}
		mrLog()
		<< "     core clk max   : " << clk_gpu_max << " MHz" << std::endl;
		
		switch (mGpuFrequency) {
			case GPU_FREQUENCY_MIN:
				clk_mem_set = clk_mem_min;
				clk_gpu_set = clk_gpu_min;
				break;
			case GPU_FREQUENCY_MAX:
				clk_mem_set = clk_mem_max;
				clk_gpu_set = clk_gpu_max;
				break;
			case GPU_FREQUENCY_CUR:
			default:
				clk_mem_set = 0;
				clk_gpu_set = 0;
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_MEM, &clk_mem_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				result = nvmlDeviceGetClockInfo(mDevice, NVML_CLOCK_GRAPHICS, &clk_gpu_set);
				if (NVML_SUCCESS != result) {
					mrLog.lock();
					mrLog(CLogger::scErr) << "!!! 'nvml thread' (thread main): Error: cannot read frequency. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
					mrLog.unlock();
					exit(EXIT_FAILURE);
				}
				
				break;
		}
		
		if (mGpuFrequency == GPU_FREQUENCY_MIN || mGpuFrequency == GPU_FREQUENCY_MAX) {
			// In these cases we actually set the GPU frequencies either to the maximum or minimum value.
			clk_gpu_str << "-c " << clk_gpu_set;
			clk_mem_str << "-m " << clk_mem_set;
			char const* args_set_clk[] = {"gpu_management", clk_gpu_str.str().c_str() , clk_mem_str.str().c_str(), NULL};
			rv = exec_gpu_mgmt((char**)args_set_clk);
			if (rv) {
				mrLog(CLogger::scErr) << "!!! 'nvml' (thread main): Error: in gpu_management tool. (file: " << __FILE__ << ", line: " << __LINE__ << ")" << std::endl;
				exit(EXIT_FAILURE);
			}
						
			mrLog()
			<< ">>> 'nvml' (thread main): set core clk to " << clk_gpu_set << " MHz and mem clk to " << clk_mem_set << " MHz." << std::endl;
		} else {
			// We name the values *_set, but we don't set the frequency. We just print the current GPU frequency.
			mrLog()
			<< ">>> 'nvml' (thread main): current core clk is " << clk_gpu_set << " MHz and mem clk is " << clk_mem_set << " MHz." << std::endl;
		}
		mrLog()
		<< ">>> 'nvml' (thread main): wait for 15s to throttle gpu clocks." << std::endl;
		sleep(15);
		
		mrLog()
		<< ">>> 'nvml' (thread main): initialization done." << std::endl
		<< std::endl;
	}
int Machine::initializeNVIDIADevices(hwloc_obj_t machine_obj, hwloc_topology_t topology)
  {
  nvmlReturn_t rc;

  /* Initialize the NVML handle. 
   *
   * nvmlInit should be called once before invoking any other methods in the NVML library. 
   * A reference count of the number of initializations is maintained. Shutdown only occurs 
   * when the reference count reaches zero.
   * */
  rc = nvmlInit();
  if (rc != NVML_SUCCESS && rc != NVML_ERROR_ALREADY_INITIALIZED)
    {
    log_nvml_error(rc, NULL, __func__);
    return(PBSE_NONE);
    }

  unsigned int device_count = 0;

  /* Get the device count. */
  rc = nvmlDeviceGetCount(&device_count);
  if (rc == NVML_SUCCESS)
    {
    nvmlDevice_t gpu;

    /* Get the nvml device handle at each index */
    for (unsigned int idx = 0; idx < device_count; idx++)
      {
      rc = nvmlDeviceGetHandleByIndex(idx, &gpu);

      if (rc != NVML_SUCCESS)
        {
        /* TODO: get gpuid from nvmlDevice_t struct */
        log_nvml_error(rc, NULL, __func__);
        }

      /* Use the hwloc library to determine device locality */
      hwloc_obj_t gpu_obj;
      hwloc_obj_t ancestor_obj;
      int is_in_tree;
  
      gpu_obj = hwloc_nvml_get_device_osdev(topology, gpu);
      if (gpu_obj == NULL)
        {
        /* This was not an nvml device. We will look for a "card" device (GeForce or Quadra) */
        gpu_obj = this->get_non_nvml_device(topology, gpu);
        if (gpu_obj == NULL)
        continue;
        }
        
      /* The ancestor was not a numa chip. Is it the machine? */
      ancestor_obj = hwloc_get_ancestor_obj_by_type(topology, HWLOC_OBJ_MACHINE, gpu_obj);
      if (ancestor_obj != NULL)
        {
        PCI_Device new_device;
  
        new_device.initializePCIDevice(gpu_obj, idx, topology);

        store_device_on_appropriate_chip(new_device);
        }
      }
    }
  else
    {
    log_nvml_error(rc, NULL, __func__);
    }

  /* Shutdown the NVML handle. 
   *
   * nvmlShutdown should be called after NVML work is done, once for each call to nvmlInit() 
   * A reference count of the number of initializations is maintained. Shutdown only occurs when 
   * the reference count reaches zero. For backwards compatibility, no error is reported if 
   * nvmlShutdown() is called more times than nvmlInit().
   * */
  rc = nvmlShutdown();
  if (rc != NVML_SUCCESS)
    {
    log_nvml_error(rc, NULL, __func__);
    }

  return(PBSE_NONE);
  }
Exemple #14
0
int main()
{
    nvmlReturn_t result;
    unsigned int device_count, i;

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    { 
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));

        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    { 
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
        goto Error;
    }
    printf("Found %d device%s\n\n", device_count, device_count != 1 ? "s" : "");

    printf("Listing devices:\n");    
    for (i = 0; i < device_count; i++)
    {
        nvmlDevice_t device;
        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
        nvmlPciInfo_t pci;
        nvmlComputeMode_t compute_mode;

        // Query for device handle to perform operations on a device
        // You can also query device handle by other features like:
        // nvmlDeviceGetHandleBySerial
        // nvmlDeviceGetHandleByPciBusId
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (NVML_SUCCESS != result)
        { 
            printf("Failed to get handle for device %i: %s\n", i, nvmlErrorString(result));
            goto Error;
        }

        result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
        if (NVML_SUCCESS != result)
        { 
            printf("Failed to get name of device %i: %s\n", i, nvmlErrorString(result));
            goto Error;
        }
        
        // pci.busId is very useful to know which device physically you're talking to
        // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
        result = nvmlDeviceGetPciInfo(device, &pci);
        if (NVML_SUCCESS != result)
        { 
            printf("Failed to get pci info for device %i: %s\n", i, nvmlErrorString(result));
            goto Error;
        }

        printf("%d. %s [%s]\n", i, name, pci.busId);

        // This is a simple example on how you can modify GPU's state
        result = nvmlDeviceGetComputeMode(device, &compute_mode);
        if (NVML_ERROR_NOT_SUPPORTED == result)
            printf("\t This is not CUDA capable device\n");
        else if (NVML_SUCCESS != result)
        { 
            printf("Failed to get compute mode for device %i: %s\n", i, nvmlErrorString(result));
            goto Error;
        }
        else
        {
            // try to change compute mode
            printf("\t Changing device's compute mode from '%s' to '%s'\n", 
                    convertToComputeModeString(compute_mode), 
                    convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

            result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
            if (NVML_ERROR_NO_PERMISSION == result)
                printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
            else if (NVML_ERROR_NOT_SUPPORTED == result)
                printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                       "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
            else if (NVML_SUCCESS != result)
            {
                printf("\t\t Failed to set compute mode for device %i: %s\n", i, nvmlErrorString(result));
                goto Error;
            } 
            else
            {
                printf("\t Restoring device's compute mode back to '%s'\n", 
                        convertToComputeModeString(compute_mode));
                result = nvmlDeviceSetComputeMode(device, compute_mode);
                if (NVML_SUCCESS != result)
                { 
                    printf("\t\t Failed to restore compute mode for device %i: %s\n", i, nvmlErrorString(result));
                    goto Error;
                }
            }
        }
    }

    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    printf("All done.\n");

    printf("Press ENTER to continue...\n");
    getchar();
    return 0;

Error:
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    printf("Press ENTER to continue...\n");
    getchar();
    return 1;
}