Exemplo n.º 1
0
/*
 * Report the memory in use on every GPU that NVML enumerates.
 *
 * ncores    - out: receives the device count from nvmlDeviceGetCount().
 * usedarray - out: usedarray[i] receives the `used` field that
 *             nvmlDeviceGetMemoryInfo() reports for device i. The caller must
 *             provide room for at least *ncores entries; this cannot be
 *             checked here. A value that does not fit in an unsigned int is
 *             clamped to the largest unsigned int instead of wrapping.
 *
 * Returns 0 on success, -1 if any NVML call (including the final
 * nvmlShutdown()) fails. Every path that got past nvmlInit() shuts NVML down
 * again before returning.
 */
static int get_mem_info(unsigned int*ncores,unsigned int*usedarray)
{
    const unsigned int uint_max=(unsigned int)-1;
    nvmlReturn_t ret;
    unsigned int count=0;
    unsigned int i;
    int status=-1;

    ret=nvmlInit();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    ret=nvmlDeviceGetCount(&count);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        goto shutdown;
    }

    *ncores=count;

    for(i=0; i<count; i++) {
        /* One handle at a time is enough, so no fixed-size handle array can
           be overrun when more devices are present than expected. */
        nvmlDevice_t dev;
        nvmlMemory_t meminfo;

        ret=nvmlDeviceGetHandleByIndex(i,&dev);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            goto shutdown;
        }

        ret=nvmlDeviceGetMemoryInfo(dev,&meminfo);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetMemoryInfo{%s}..\n",nvmlErrorString(ret));
            goto shutdown;
        }

        /* meminfo.used is wider than the caller's slots: clamp, don't wrap. */
        usedarray[i]=(meminfo.used>uint_max)?uint_max:(unsigned int)meminfo.used;
    }

    status=0;

shutdown:
    /* Balance the nvmlInit() above on success and on failure alike. */
    ret=nvmlShutdown();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        status=-1;
    }

    return status;
}
Exemplo n.º 2
0
		/*
		 * Fetch one location-specific volatile ECC error counter for a device.
		 *
		 * dev       - device handle passed to nvmlDeviceGetDetailedEccErrors().
		 * bits      - ECC bit type, passed through to NVML unchanged.
		 * which_one - selects the field to return: LOCAL_ECC_REGFILE,
		 *             LOCAL_ECC_L1, LOCAL_ECC_L2 or LOCAL_ECC_MEM.
		 *
		 * Returns the selected counter, or (unsigned long long)-1 when the NVML
		 * query fails or which_one is none of the selectors above.
		 */
		unsigned long long
getEccLocalErrors( nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
{
		nvmlEccErrorCounts_t counts;
		nvmlReturn_t bad;

		bad = nvmlDeviceGetDetailedEccErrors( dev, bits, NVML_VOLATILE_ECC , &counts);
		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
				/* counts was never filled in; reading it would be undefined. */
				return (unsigned long long)-1;
		}

		switch ( which_one ) {
				case LOCAL_ECC_REGFILE:
						return counts.registerFile;
				case LOCAL_ECC_L1:
						return counts.l1Cache;
				case LOCAL_ECC_L2:
						return counts.l2Cache;
				case LOCAL_ECC_MEM:
						return counts.deviceMemory;
				default:
						break;
		}
		return (unsigned long long)-1;
}
Exemplo n.º 3
0
/*
 * Terminate the program if an NVML call did not succeed.
 *
 * nvret - status code returned by the NVML call being checked.
 * msg   - caller-supplied text included in the diagnostic.
 * line  - number printed after "@" in the diagnostic.
 *
 * Returns normally only when nvret is NVML_SUCCESS; otherwise prints a
 * message to stderr and calls exit(1).
 */
static void check_nvml_error(nvmlReturn_t nvret,const char*msg,int line)
{
    if(NVML_SUCCESS==nvret) {
        return;
    }

    fprintf(stderr,"NVML:@%d::%s:{%s}\n",line,msg,nvmlErrorString(nvret));
    exit(1);
}
Exemplo n.º 4
0
		/*
		 * Translate a device's NVML performance state into its number.
		 *
		 * dev - device handle passed to nvmlDeviceGetPerformanceState().
		 *
		 * Returns N for NVML_PSTATE_N (0..15), and (unsigned long long)-1 for
		 * NVML_PSTATE_UNKNOWN or any other value. A failed query is only
		 * logged; the local state starts out as NVML_PSTATE_15, so 15 is
		 * returned in that case unless the failed call changed it.
		 */
		unsigned long long
getPState( nvmlDevice_t dev ) 
{
		nvmlPstates_t pstate = NVML_PSTATE_15;
		nvmlReturn_t status = nvmlDeviceGetPerformanceState( dev, &pstate );

		if ( status != NVML_SUCCESS ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
		}

		switch ( pstate ) {
				case NVML_PSTATE_0:  return 0ULL;
				case NVML_PSTATE_1:  return 1ULL;
				case NVML_PSTATE_2:  return 2ULL;
				case NVML_PSTATE_3:  return 3ULL;
				case NVML_PSTATE_4:  return 4ULL;
				case NVML_PSTATE_5:  return 5ULL;
				case NVML_PSTATE_6:  return 6ULL;
				case NVML_PSTATE_7:  return 7ULL;
				case NVML_PSTATE_8:  return 8ULL;
				case NVML_PSTATE_9:  return 9ULL;
				case NVML_PSTATE_10: return 10ULL;
				case NVML_PSTATE_11: return 11ULL;
				case NVML_PSTATE_12: return 12ULL;
				case NVML_PSTATE_13: return 13ULL;
				case NVML_PSTATE_14: return 14ULL;
				case NVML_PSTATE_15: return 15ULL;
				case NVML_PSTATE_UNKNOWN:
				default:
						/* Unknown performance state: fall out to the sentinel. */
						break;
		}

		return (unsigned long long) -1;
}
Exemplo n.º 5
0
/*
 * Read one clock value of a device through nvmlDeviceGetClockInfo().
 *
 * dev       - device handle to query.
 * which_one - clock type, passed through to NVML unchanged.
 *
 * Returns the value NVML stored, widened to unsigned long long. A failed
 * query is only logged; the local starts at 0, so 0 is returned in that case
 * unless the failed call wrote something else.
 */
unsigned long long
getClockSpeed( nvmlDevice_t dev, nvmlClockType_t which_one )
{
		unsigned int speed = 0;
		nvmlReturn_t status = nvmlDeviceGetClockInfo( dev, which_one, &speed );

		if ( status != NVML_SUCCESS ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
		}

		return (unsigned long long)speed;
}
Exemplo n.º 6
0
		/*
		 * Read the total volatile ECC error count of a device.
		 *
		 * dev  - device handle passed to nvmlDeviceGetTotalEccErrors().
		 * bits - ECC bit type, passed through to NVML unchanged.
		 *
		 * Returns the count NVML stored. A failed query is only logged; the
		 * local starts at 0, so 0 is returned in that case unless the failed
		 * call wrote something else.
		 */
		unsigned long long
getTotalEccErrors( nvmlDevice_t dev, nvmlEccBitType_t bits) 
{
		unsigned long long total = 0;
		nvmlReturn_t status = nvmlDeviceGetTotalEccErrors( dev, bits, NVML_VOLATILE_ECC , &total);

		if ( status != NVML_SUCCESS ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
		}

		return total;
}
Exemplo n.º 7
0
		/*
		 * Read the NVML_TEMPERATURE_GPU sensor of a device.
		 *
		 * dev - device handle passed to nvmlDeviceGetTemperature().
		 *
		 * Returns the reading NVML stored, widened to unsigned long long. A
		 * failed query is only logged; the local starts at 0, so 0 is returned
		 * in that case unless the failed call wrote something else.
		 */
		unsigned long long
getTemperature( nvmlDevice_t dev )
{
		unsigned int reading = 0;
		nvmlReturn_t status = nvmlDeviceGetTemperature( dev, NVML_TEMPERATURE_GPU, &reading );

		if ( status != NVML_SUCCESS ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
		}

		return (unsigned long long)reading;
}
Exemplo n.º 8
0
		/*
		 * Read the power usage of a device through nvmlDeviceGetPowerUsage().
		 *
		 * dev - device handle to query.
		 *
		 * Returns the value NVML stored, widened to unsigned long long, or
		 * (unsigned long long)-1 when the query fails.
		 */
		unsigned long long
getPowerUsage( nvmlDevice_t dev )
{
		unsigned int power = 0;
		nvmlReturn_t bad;

		bad = nvmlDeviceGetPowerUsage( dev, &power );
		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
				/* Previously an uninitialized local was returned here. */
				return (unsigned long long) -1;
		}

		return (unsigned long long) power;
}
Exemplo n.º 9
0
		/*
		 * Read the fan speed of a device through nvmlDeviceGetFanSpeed().
		 *
		 * dev - device handle to query.
		 *
		 * Returns the value NVML stored, widened to unsigned long long. A
		 * failed query is only logged; the local starts at 0, so 0 is returned
		 * in that case unless the failed call wrote something else.
		 */
		unsigned long long 
getFanSpeed( nvmlDevice_t dev ) 
{
		unsigned int speed = 0;
		nvmlReturn_t status = nvmlDeviceGetFanSpeed( dev, &speed );

		if ( status != NVML_SUCCESS ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(status));
		}

		return (unsigned long long)speed; 
}
Exemplo n.º 10
0
/*
 * NVIDIA NVML library function wrapper for GPU DVFS.
 *
 * Initializes NVML, looks up the device at index 0 and passes clock_mem and
 * clock_core unchanged to nvmlDeviceSetApplicationsClocks().
 *
 * Returns 0 on success and 1 on failure; the reason for a failure is printed
 * to stdout. Every NVML step is checked, so an uninitialized device handle is
 * never handed to NVML.
 *
 * NOTE(review): NVML is left initialized on return, exactly as before this
 * change; confirm whether callers rely on that or whether a matching
 * shutdown belongs here.
 */
int SetGPUFreq(unsigned int clock_mem, unsigned int clock_core) {
    nvmlDevice_t device;
    nvmlReturn_t result;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceGetHandleByIndex(0, &device);
    if (result != NVML_SUCCESS) {
        printf("Failed to get handle for device 0: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceSetApplicationsClocks(device, clock_mem, clock_core);
    if (result != NVML_SUCCESS) {
        printf("Failed to set GPU core and memory frequencies: %s\n", nvmlErrorString(result));
        return 1;
    }

    return 0;
}
Exemplo n.º 11
0
/*
 * Read one utilization figure of a device through
 * nvmlDeviceGetUtilizationRates().
 *
 * dev       - device handle to query.
 * which_one - GPU_UTILIZATION selects util.gpu, MEMORY_UTILIZATION selects
 *             util.memory (the original note lists these as
 *             0 => gpu util, 1 => memory util).
 *
 * Returns the selected figure, or (unsigned long long)-1 when the query
 * fails or which_one is neither selector.
 */
		unsigned long long
getUtilization( nvmlDevice_t dev, int which_one )
{
		nvmlUtilization_t util;
		nvmlReturn_t bad;

		bad = nvmlDeviceGetUtilizationRates( dev, &util );
		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
				/* util was never filled in; reading it would be undefined. */
				return (unsigned long long) -1;
		}

		switch (which_one) {
				case GPU_UTILIZATION:
						return (unsigned long long) util.gpu;
				case MEMORY_UTILIZATION:
						return (unsigned long long) util.memory;
				default:
						break;
		}

		return (unsigned long long) -1;
}
Exemplo n.º 12
0
		/*
		 * Read one memory figure of a device through nvmlDeviceGetMemoryInfo().
		 *
		 * dev       - device handle to query.
		 * which_one - MEMINFO_TOTAL_MEMORY selects meminfo.total,
		 *             MEMINFO_UNALLOCED selects meminfo.free,
		 *             MEMINFO_ALLOCED selects meminfo.used.
		 *
		 * Returns the selected figure, or (unsigned long long)-1 when the
		 * query fails or which_one is none of the selectors above.
		 */
		unsigned long long
getMemoryInfo( nvmlDevice_t dev, int which_one )
{
		nvmlMemory_t meminfo;
		nvmlReturn_t bad;

		bad = nvmlDeviceGetMemoryInfo( dev, &meminfo );
		if ( NVML_SUCCESS != bad ) {
				SUBDBG( "something went wrong %s\n", nvmlErrorString(bad));
				/* meminfo was never filled in; reading it would be undefined. */
				return (unsigned long long)-1;
		}

		switch (which_one) {
				case MEMINFO_TOTAL_MEMORY:
						return meminfo.total;
				case MEMINFO_UNALLOCED:
						return meminfo.free;
				case MEMINFO_ALLOCED:
						return meminfo.used;
				default:
						break;
		}
		return (unsigned long long)-1;
}
Exemplo n.º 13
0
/*
 * Class:     org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor
 * Method:    initnvml
 * Signature: ()Ljava/lang/String;
 *
 * Initializes NVML, obtains the handle of device index 0 into `device` and
 * queries that device's name.
 *
 * Returns a Java string of the form "Device : <name>\n" on success. Returns
 * NULL when any NVML step fails; the failure is reported on stdout, and NVML
 * is shut down again if it had been initialized. On success NVML is left
 * initialized.
 *
 * NOTE(review): `device` is not declared in this function; presumably it is a
 * file-scope nvmlDevice_t -- confirm.
 */
JNIEXPORT jstring JNICALL Java_org_apache_hadoop_yarn_server_nodemanager_containermanager_launcher_GPUMonitor_initnvml
  (JNIEnv *env, jobject)
{
    const unsigned int device_index = 0;
    nvmlReturn_t result;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    char sentence[200];
    std::string err = "";

    result = nvmlInit();
    if (NVML_SUCCESS != result) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        /* Nothing to shut down: initialization itself failed. */
        return NULL;
    }

    result = nvmlDeviceGetHandleByIndex(device_index, &device);
    if (NVML_SUCCESS != result) {
        printf("Failed to get handle for device %u: %s\n", device_index, nvmlErrorString(result));
        result = nvmlShutdown();
        return NULL;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result) {
        printf("Failed to get name of device %u: %s\n", device_index, nvmlErrorString(result));
        result = nvmlShutdown();
        return NULL;
    }

    printf("Device : %s\n", name);
    /* Bounded formatting: the message can never overrun `sentence`. */
    snprintf(sentence, sizeof sentence, "Device : %s\n", name);
    err.append(sentence);
    return env->NewStringUTF( err.c_str() );
}
Exemplo n.º 14
0
		/*
		 * Map every CUDA device index to its NVML handle and record which
		 * NVML queries each device supports.
		 *
		 * Uses names declared outside this function: reads device_count, fills
		 * devices[i] (NVML handle for CUDA index i) and features[i] (FEATURE_*
		 * bit mask), and adds the number of events contributed by each
		 * uniquely named device to num_events.
		 *
		 * Steps:
		 *   1. collect the PCI bus id of every NVML device;
		 *   2. for each CUDA index, find the NVML device with the same bus id;
		 *   3. probe each device; a device whose name was already seen copies
		 *      the feature mask of the earlier one and adds no events.
		 *
		 * Returns PAPI_OK on success and PAPI_ESYS when a required NVML or
		 * CUDA call fails.
		 */
		static int 
detectDevices( ) 
{
		nvmlReturn_t ret;
		nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
		nvmlDevice_t handle;
		nvmlPciInfo_t info;

		cudaError_t cuerr;

		char busId[16];
		char name[64];
		char inforomECC[16];
		char inforomPower[16];
		char names[device_count][64];
		char nvml_busIds[device_count][16];

		float ecc_version = 0.0, power_version = 0.0;

		int i = 0,
			j = 0;
		int isTesla = 0;
		int isFermi	= 0;
		int isUnique = 1;

		unsigned int temp = 0;


		/* list of nvml pci_busids */
		for (i=0; i < device_count; i++) {
				ret = nvmlDeviceGetHandleByIndex( i, &handle );	
				if ( NVML_SUCCESS != ret ) {
						SUBDBG("nvmlDeviceGetHandleByIndex(%d) failed\n", i);
						return PAPI_ESYS;
				}

				ret = nvmlDeviceGetPciInfo( handle, &info );
				if ( NVML_SUCCESS != ret ) {
						SUBDBG("nvmlDeviceGetPciInfo() failed %s\n", nvmlErrorString(ret) );
						return PAPI_ESYS;
				}

				strncpy(nvml_busIds[i], info.busId, 16);
		}

		/* We want to key our list of nvmlDevice_ts by each device's cuda index */
		for (i=0; i < device_count; i++) {
				cuerr = cudaDeviceGetPCIBusId( busId, 16, i );
				if ( CUDA_SUCCESS != cuerr ) {
						SUBDBG("cudaDeviceGetPCIBusId failed.\n");
						return PAPI_ESYS;
				}
				for (j=0; j < device_count; j++ ) {
						if ( !strncmp( busId, nvml_busIds[j], 16) ) {
								ret = nvmlDeviceGetHandleByIndex(j, &devices[i] );
								/* The braces matter: without them the return ran
								 * on every match, even after a successful lookup. */
								if ( NVML_SUCCESS != ret ) {
										SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", j, i);
										return PAPI_ESYS;
								}
								break;
						}
				}	
		}

		memset(names, 0x0, device_count*64);
		/* So for each card, check whats querable */
		for (i=0; i < device_count; i++ ) {
				isTesla=0;
				isFermi=1;
				isUnique = 1;
				features[i] = 0;

				ret = nvmlDeviceGetName( devices[i], name, 64 );
				if ( NVML_SUCCESS != ret) {
						SUBDBG("nvmlDeviceGetName failed \n");
						return PAPI_ESYS;
				}

				for (j=0; j < i; j++ ) {
						if ( 0 == strncmp( name, names[j], 64 ) ) {
								/* if we have a match, and IF everything is sane, 
								 * devices with the same name eg Tesla C2075 share features */
								isUnique = 0;
								features[i] = features[j];
						}
				}

				if ( isUnique ) {
						/* Start from empty strings so that strtof() below never
						 * parses an uninitialized buffer when a query fails. */
						inforomECC[0] = '\0';
						inforomPower[0] = '\0';

						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_ECC, inforomECC, 16);
						if ( NVML_SUCCESS != ret ) {
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								isFermi = 0;
						}
						ret = nvmlDeviceGetInforomVersion( devices[i], NVML_INFOROM_POWER, inforomPower, 16);
						if ( NVML_SUCCESS != ret ) {
								/* This implies the card is older then Fermi */
								SUBDBG("nvmlGetInforomVersion carps %s\n", nvmlErrorString(ret ) );
								SUBDBG("Based upon the return to nvmlGetInforomVersion, we conclude this card is older then Fermi.\n");
								isFermi = 0;
						} 

						ecc_version = strtof(inforomECC, NULL );
						power_version = strtof( inforomPower, NULL);

						ret = nvmlDeviceGetName( devices[i], name, 64 );
						isTesla = ( NULL == strstr(name, "Tesla") ) ? 0:1;

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_CLOCK_INFO;
								num_events += 3;
						}

						/* 	For Tesla and Quadro products from Fermi and Kepler families. 
							requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
							requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
							requires ECC mode to be enabled. */
						if ( isFermi ) {
								/* Reset first so a failed query cannot reuse the
								 * mode read from a previous device. */
								mode = NVML_FEATURE_DISABLED;
								ret = nvmlDeviceGetEccMode( devices[i], &mode, NULL );
								if ( NVML_FEATURE_ENABLED == mode) {
										if ( ecc_version >= 2.0 ) {
												features[i] |= FEATURE_ECC_LOCAL_ERRORS;
												num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
										} 
										if ( ecc_version >= 1.0 ) {
												features[i] |= FEATURE_ECC_TOTAL_ERRORS;
												num_events += 2; /* single bit errors, double bit errors */
										}
								}	
						}

						/* For all discrete products with dedicated fans */
						features[i] |= FEATURE_FAN_SPEED;
						num_events++;

						/* For Tesla and Quadro products from Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_MAX_CLOCK;
								num_events += 3;
						}

						/* For all products */
						features[i] |= FEATURE_MEMORY_INFO;
						num_events += 3; /* total, free, used */

						/* For Tesla and Quadro products from the Fermi and Kepler families. */
						if ( isFermi ) {
								features[i] |= FEATURE_PERF_STATES;
								num_events++;
						}

						/* 	For "GF11x" Tesla and Quadro products from the Fermi family
							requires NVML_INFOROM_POWER 3.0 or higher
							For Tesla and Quadro products from the Kepler family
							does not require NVML_INFOROM_POWER */
						if ( isFermi ) {
								ret = nvmlDeviceGetPowerUsage( devices[i], &temp);
								if ( NVML_SUCCESS == ret ) {
										features[i] |= FEATURE_POWER;
										num_events++;
								}
						}

						/* For all discrete and S-class products. */
						features[i] |= FEATURE_TEMP;
						num_events++;

						/* For Tesla and Quadro products from the Fermi and Kepler families */
						if (isFermi) {
								features[i] |= FEATURE_UTILIZATION;
								num_events += 2;
						}

						/* Remember the name so later identical cards reuse this mask. */
						strncpy( names[i], name, 64); 

				}
		}
		return PAPI_OK;

}
Exemplo n.º 15
0
/*
 * Report how many compute processes are running on every GPU that NVML
 * enumerates.
 *
 * ncores   - out: receives the device count from nvmlDeviceGetCount().
 * valarray - out: valarray[i] receives the process count that
 *            nvmlDeviceGetComputeRunningProcesses() reports for device i.
 *            The caller must provide room for at least *ncores entries; this
 *            cannot be checked here.
 *
 * At most MAXPROC process records are requested per device; any NVML error
 * from that query is treated as a failure.
 *
 * Returns 0 on success, -1 if any NVML call (including the final
 * nvmlShutdown()) fails. Every path that got past nvmlInit() shuts NVML down
 * again before returning.
 */
static int get_process_info(unsigned int*ncores,unsigned int *valarray)
{
    nvmlReturn_t ret;
    nvmlProcessInfo_t pis[MAXPROC];
    unsigned int count=0;
    unsigned int i;
    int status=-1;

    ret=nvmlInit();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Initialize NVML{%s}..\n",nvmlErrorString(ret));
        return -1;
    }

    ret=nvmlDeviceGetCount(&count);
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Device Get Count{%s}..\n",nvmlErrorString(ret));
        goto shutdown;
    }

    *ncores=count;

    for(i=0; i<count; i++) {
        /* One handle at a time is enough, so no fixed-size handle array can
           be overrun when more devices are present than expected. */
        nvmlDevice_t dev;
        /* In: capacity of pis. Out: number of processes NVML reported. */
        unsigned int np=MAXPROC;

        ret=nvmlDeviceGetHandleByIndex(i,&dev);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: Device Get Handle{%s}..\n",nvmlErrorString(ret));
            goto shutdown;
        }

        ret=nvmlDeviceGetComputeRunningProcesses(dev,&np,pis);
        if(ret!=NVML_SUCCESS) {
            fprintf(stderr,"ERROR:: GetRunningProcess{%s}..\n",nvmlErrorString(ret));
            goto shutdown;
        }

        valarray[i]=np;
    }

    status=0;

shutdown:
    /* Balance the nvmlInit() above on success and on failure alike. */
    ret=nvmlShutdown();
    if(ret!=NVML_SUCCESS) {
        fprintf(stderr,"ERROR:: Shutdown NVML{%s}..\n",nvmlErrorString(ret));
        status=-1;
    }

    return status;
}
Exemplo n.º 16
0
/*
 * NVML sample program: lists every device with its name and PCI bus id, then
 * tries to switch each device's compute mode to NVML_COMPUTEMODE_PROHIBITED
 * and back to the mode it had before.
 *
 * Returns 0 when every device was processed and 1 as soon as an NVML call
 * fails unexpectedly. "Not supported" and "no permission" answers to the
 * compute-mode calls are reported but do not abort the run. The program
 * waits for ENTER before exiting on every path.
 */
int main(void)
{
    nvmlReturn_t result;
    unsigned int device_count, i;
    int exit_code = 1;

    // First initialize NVML library
    result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));

        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
        goto Cleanup;
    }
    printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");

    printf("Listing devices:\n");
    for (i = 0; i < device_count; i++)
    {
        nvmlDevice_t device;
        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
        nvmlPciInfo_t pci;
        nvmlComputeMode_t compute_mode;

        // Query for device handle to perform operations on a device
        // You can also query device handle by other features like:
        // nvmlDeviceGetHandleBySerial
        // nvmlDeviceGetHandleByPciBusId
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        // pci.busId is very useful to know which device physically you're talking to
        // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
        result = nvmlDeviceGetPciInfo(device, &pci);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }

        printf("%u. %s [%s]\n", i, name, pci.busId);

        // This is a simple example on how you can modify GPU's state
        result = nvmlDeviceGetComputeMode(device, &compute_mode);
        if (NVML_ERROR_NOT_SUPPORTED == result)
        {
            printf("\t This is not CUDA capable device\n");
        }
        else if (NVML_SUCCESS != result)
        {
            printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result));
            goto Cleanup;
        }
        else
        {
            // try to change compute mode
            printf("\t Changing device's compute mode from '%s' to '%s'\n",
                    convertToComputeModeString(compute_mode),
                    convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

            result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
            if (NVML_ERROR_NO_PERMISSION == result)
            {
                printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
            }
            else if (NVML_ERROR_NOT_SUPPORTED == result)
            {
                printf("\t\t Compute mode prohibited not supported. You might be running on\n"
                       "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
            }
            else if (NVML_SUCCESS != result)
            {
                printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result));
                goto Cleanup;
            }
            else
            {
                printf("\t Restoring device's compute mode back to '%s'\n",
                        convertToComputeModeString(compute_mode));
                result = nvmlDeviceSetComputeMode(device, compute_mode);
                if (NVML_SUCCESS != result)
                {
                    printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result));
                    goto Cleanup;
                }
            }
        }
    }

    // Every device was handled; anything that jumps to Cleanup keeps exit code 1.
    exit_code = 0;

Cleanup:
    // Single exit path: NVML is shut down whether or not the run succeeded.
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    if (0 == exit_code)
        printf("All done.\n");

    printf("Press ENTER to continue...\n");
    getchar();
    return exit_code;
}