Ejemplo n.º 1
0
/**
 * PPU program entry point.
 */
int main(int argc, char** argv)
{
    /* Get global memory pointer */
    fixedgrid_t* const G = &G_GLOBAL;
    
    /* Iterators */
    uint32_t k, iter;
    
    /* Start wall clock timer */
    timer_start(&G->metrics.wallclock);

    /* Check dimensions */
    if(NX % BLOCK_X != 0)
    {
        fprintf(stderr, "NX must be a multiple of %d\n", BLOCK_X);
        exit(1);
    }    
    if(NY % BLOCK_Y != 0)
    {
        fprintf(stderr, "NY must be a multiple of %d\n", BLOCK_Y);
        exit(1);
    }
    if(NZ % BLOCK_Z != 0)
    {
        fprintf(stderr, "NZ must be a multiple of %d\n", BLOCK_Z);
        exit(1);
    }
    
    /* Initialize the model parameters */
    init_model(G);
    
    /* Add emissions */
    process_emissions(G);
    
    /* Print startup banner */
    print_start_banner(G);
    
    /* Store initial concentration */
    printf("Writing initial concentration data... ");
    write_conc(G, 0, 0);
    printf("done.\n");    
        
    printf("\n!!!!FIXME: Report # FPEs\n");
        
    /* BEGIN CALCULATIONS */
    for(iter=1, G->time = G->tstart; G->time < G->tend; G->time += G->dt, ++iter)
    {
        start_saprc99(G);
        
        for(k=0; k<NLOOKAT; k++)
        {
            // Copy concentration data to device
            CU_SAFE_CALL(cuMemcpyHtoD(G->dev_conc, &G->conc(0, 0, 0, MONITOR[k]), NX*NY*NZ*sizeof(real_t)));
            
            discretize_all_x(G, G->dt*0.5);
            
            discretize_all_y(G, G->dt*0.5);
            
            discretize_all_z(G, G->dt);
            
            discretize_all_y(G, G->dt*0.5);
            
            discretize_all_x(G, G->dt*0.5);
            
            // Copy updated concentrations back to host
            CU_SAFE_CALL(cuMemcpyDtoH((void*)&G->conc(0, 0, 0, MONITOR[k]), G->dev_conc_out, NX*NY*NZ*sizeof(real_t)));            
        }

        update_model(G);
        
        #if WRITE_EACH_ITER == 1
        write_conc(G, iter, 0);
        #endif

        printf("  After iteration %02d: Model time = %07.2f sec.\n", iter, iter*G->dt);
    }
    /* END CALCULATIONS */
    
    /* Store concentration */
    #if WRITE_EACH_ITER != 1
    write_conc(G, iter-1, 0);
    #endif
    
    /* Show final time */
    printf("\nFinal time: %f seconds.\n", (iter-1)*G->dt);
    
    timer_stop(&G->metrics.wallclock);
    
    /* Write metrics to CSV file */
    write_metrics_as_csv(G, "NVidia CUDA");
    
    /* Cleanup and exit */

    CU_SAFE_CALL(cuMemFree(G->dev_conc));
    CU_SAFE_CALL(cuMemFree(G->dev_wind));
    CU_SAFE_CALL(cuMemFree(G->dev_diff));
    CU_SAFE_CALL(cuMemFree(G->dev_buff));
    CU_SAFE_CALL(cuMemFree(G->dev_conc_out));
    CU_SAFE_CALL_NO_SYNC(cuCtxDetach(cu_context_global));
    
    return 0;
}
Ejemplo n.º 2
0
static void try_init( void ) {
  int deviceCount = 0;                                                    
	int syncflag;
	char *sync;
  CUresult err = cuInit(0);
	CUresult status;
  struct cudaDeviceProp deviceProp;
	if( state.init  ) { return; } // already initialised
  state.device = 0;                                                            

  if (CUDA_SUCCESS == err)                                                 
  CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                
  if (deviceCount == 0) {  error( "No device found" ); }


	if( state.target_device >= deviceCount ) {
		error( "Invalid device requested" );
	}



        CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device));        
#ifdef USE_BOINC
	syncflag=0x4;
#else
	syncflag=0x0;
#endif

	sync = getenv("SWAN_SYNC" );
	if( sync != NULL ) {
		syncflag = atoi( sync );
		fprintf(stderr, "SWAN: Using synchronization method %d\n", syncflag );
	}

	if( getenv("SWAN_PROFILE") || getenv("CUDA_PROFILE") ) {
		state.debug = 1;
	}
	if( state.debug ) {
		printf("SWAN: Built for CUDA version %d.%d\n", CUDA_MAJOR, CUDA_MINOR );
	}

#ifdef DEV_EXTENSIONS
	syncflag |= CU_CTX_MAP_HOST;
#endif

#ifdef USE_FIXED_DEVICE
printf( "********************************************** OVERRIDING DEVICE ALLOCATION\n");
  status = cuCtxCreate( &(state.context), syncflag ,  0 ); // state.device + state.target_device );
  if ( CUDA_SUCCESS != status ) error ( "Unable to create context\n" );
#else
  status = cuCtxCreate( &(state.context), syncflag , state.target_device );


  if ( CUDA_SUCCESS != status ) {
   printf("SWAN: Failed to get requested device (compute exclusive mode). Trying any..\n" );
    int count;
    cuDeviceGetCount( &count );
    int i=0;

    while ( (i < count) &&  (status != CUDA_SUCCESS) ) {
      state.target_device = i;
      CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&(state.device), state.target_device));
      status = cuCtxCreate( &(state.context), syncflag , state.target_device );
			i++;
    }
	}

#endif

  if ( CUDA_SUCCESS != status ) error ( "Unable to create context\n" );

	state.mods         = NULL;
	state.mod_names    = NULL;
	state.num_mods     = 0;
	state.funcs        = NULL;
	state.func_names   = NULL;
	state.num_funcs    = 0;

	state.init = 1;

  cudaGetDeviceProperties(&deviceProp, state.device);

	cuStreamCreate( &state.stream, 0 );
	state.device_version = (deviceProp.major * 100 + deviceProp.minor * 10);
	state.multiProcessorCount = deviceProp.multiProcessorCount;

}
Ejemplo n.º 3
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
	CUdevice dev;
	int major = 0, minor = 0;
	int deviceCount = 0;
	char deviceName[256];

	// note your project will need to link with cuda.lib files on windows
	printf("CUDA Device Query (Driver API) statically linked version \n");

	CUresult err = cuInit(0);
    CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));
	// This function call returns 0 if there are no CUDA capable devices.
	if (deviceCount == 0) {
        printf("There is no device supporting CUDA\n");
	}
    for (dev = 0; dev < deviceCount; ++dev) {
		CU_SAFE_CALL_NO_SYNC( cuDeviceComputeCapability(&major, &minor, dev) );

        if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (major == 9999 && minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetName(deviceName, 256, dev) );
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    #if CUDA_VERSION >= 2020
		int driverVersion = 0;
		cuDriverGetVersion(&driverVersion);
		printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
    #endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

		size_t totalGlobalMem;
		CU_SAFE_CALL_NO_SYNC( cuDeviceTotalMem(&totalGlobalMem, dev) );
        printf("  Total amount of global memory:                 %llu bytes\n", (unsigned long long)totalGlobalMem);

    #if CUDA_VERSION >= 2000
	    int multiProcessorCount;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) );
        shrLog("  Multiprocessors x Cores/MP = Cores:            %d (MP) x %d (Cores/MP) = %d (Cores)\n", 
			     multiProcessorCount, ConvertSMVer2Cores(major, minor), 
				 ConvertSMVer2Cores(major, minor) * multiProcessorCount);
	#endif

 	    int totalConstantMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ) );
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
 	    int sharedMemPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ) );
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
 	    int regsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ) );
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
 	    int warpSize;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ) );
        printf("  Warp size:                                     %d\n",	warpSize);
 	    int maxThreadsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ) );
		printf("  Maximum number of threads per block:           %d\n",	maxThreadsPerBlock);
 	    int blockDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]);
 	    int gridDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]);
  	    int memPitch;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ) );
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);
  	    int textureAlign;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ) );
        printf("  Texture alignment:                             %u bytes\n", textureAlign);
  	    int clockRate;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) );
        printf("  Clock rate:                                    %.2f GHz\n", clockRate * 1e-6f);
    #if CUDA_VERSION >= 2000
	    int gpuOverlap;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ) );
        printf("  Concurrent copy and execution:                 %s\n",gpuOverlap ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 2020
	    int kernelExecTimeoutEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ) );
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
	    int integrated;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ) );
        printf("  Integrated:                                    %s\n", integrated ? "Yes" : "No");
	    int canMapHostMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ) );
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3000
	    int concurrentKernels;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ) );
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");
	    int eccEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ) );
        printf("  Device has ECC support enabled:                %s\n", eccEnabled ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3020
	    int tccDriver ;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ) );
		printf("  Device is using TCC driver mode:               %s\n", tccDriver ? "Yes" : "No");
    #endif
	}
    printf("\nPASSED\n");
    CUT_EXIT(argc, argv);
}
Ejemplo n.º 4
0
void swanRunKernelAsync( const char *kernel,  block_config_t grid , block_config_t block, size_t shmem, int flags, void *ptrs[], int *types  ) {
	// find the kernel

	if( !grid.x || !grid.y || !grid.z || !block.x || !block.y || !block.z ) { return; } // suppress launch of kernel if any of the launch dims are 0

	CUfunction f = NULL;
	int i;
	int offset = 0;
	CUresult err;

	int type;
	int idx=0;
	try_init();
	for( i=0; i < state.num_funcs; i++ ) {
		if( !strcmp( state.func_names[i], kernel ) ) {
			f = state.funcs[i];
			break;
		}
	}

	if( f == NULL ) {
		for( i=0; i < state.num_mods; i++ ) {
			cuModuleGetFunction( &f, state.mods[i], kernel );
			if( f!= NULL ) { 
				// found a kernel. store it for future use
				int j = state.num_funcs;
				state.num_funcs++;
				state.funcs      = (CUfunction*) realloc( state.funcs, sizeof(CUfunction) * state.num_funcs );
				state.funcs[j]   = f;
				state.func_names = (char**)      realloc( state.func_names, sizeof(char*) * state.num_funcs );
				state.func_names[j] = (char*) malloc( strlen(kernel) + 1 );
				strcpy( state.func_names[j], kernel );
				break; 
			}
		}
	}

	if( f== NULL ) {
		fprintf(stderr, "Error running kernel [%s] : \n", kernel );
		error( "No kernel found" );
	}

	if( grid.z != 1 ) {
		printf("Kernel [%s] launched with (%d %d %d)(%d %d %d)\n", kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z );
		error( "grid.z needs to be 1" );
	}

//printf("Running kernel [%s]\n", kernel );

	type = types[idx];
	while( type != SWAN_END ) {
		void *ptr = ptrs[idx];
		switch( type ) {
//			DEBLOCK( SWAN_uchar, uchar,  1 );
			DEBLOCK( SWAN_uchar2, uchar2,  2 );
			DEBLOCK( SWAN_uchar3, uchar3,  1 );
			DEBLOCK( SWAN_uchar4, uchar4,  4 );
			DEBLOCK( SWAN_char , int,  1 );
//			DEBLOCK( SWAN_char1 , char1,  1 );
			DEBLOCK( SWAN_char2 , char2,  2 );
			DEBLOCK( SWAN_char3 , char3,  1 );
			DEBLOCK( SWAN_char4 , char4,  4 );
			DEBLOCK( SWAN_int, int,  4 );
//			DEBLOCK( SWAN_int1, int1,  4 );
			DEBLOCK( SWAN_int2, int2,  8 );
			DEBLOCK( SWAN_int3, int3,  4 );
			DEBLOCK( SWAN_int4, int4,  16 );
//			DEBLOCK( SWAN_float, double,  4 );
//			DEBLOCK( SWAN_float1, float1,  4 );
			DEBLOCK( SWAN_float2, float2,  8 );
			DEBLOCK( SWAN_float3, float3,  4 );
			DEBLOCK( SWAN_float4, float4,  16 );

			DEBLOCK( SWAN_uint, uint,  4 );
			DEBLOCK( SWAN_uint2, uint2,  8 );
			DEBLOCK( SWAN_uint3, uint3,  4 );
			DEBLOCK( SWAN_uint4, uint4,  16 );
			DEBLOCK( SWAN_float, float,  4 );


//#define DEBLOCK(swan_type,type,OFFSET) 
#if ( CUDA_MAJOR == 3 && CUDA_MINOR >= 2 ) || CUDA_MAJOR >= 4
			case SWAN_PTR: 
				{
//printf("PTR as NATIVE\n");
				ALIGN_UP( offset, (sizeof(void*)));
				cuParamSetv( f, offset, ptr, sizeof(void*) );
				offset += sizeof(void*); }
			break;
#else
			case SWAN_PTR: 
				{
//printf("PTR as INT\n");
				ALIGN_UP( offset, (sizeof(int)));
				cuParamSetv( f, offset, ptr, sizeof(int) );
				offset += sizeof(int); }
			break;
#endif



			default:
        printf("%d\n", type );
				error("Parameter type not handled\n");


		}
		idx++;
		type = types[idx];
	}

//printf("Launching kernel [%s] [%X]  with (%d %d %d) (%d %d %d)\n", kernel, f, grid.x, grid.y, grid.z, block.x, block.y, block.z );
//printf(" TOTAL OFFSET %d\n", offset );
	CU_SAFE_CALL_NO_SYNC( cuParamSetSize( f, offset ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetBlockShape( f, block.x, block.y, block.z ) );
	CU_SAFE_CALL_NO_SYNC( cuFuncSetSharedSize( f, shmem ) );
#if (CUDA_MAJOR ==3 && CUDA_MINOR >=1 ) || CUDA_MAJOR>=4
	cuFuncSetCacheConfig( f, CU_FUNC_CACHE_PREFER_SHARED ); // This seems to be better in every case for acemd
#endif

	err = cuLaunchGridAsync( f, grid.x, grid.y, NULL ) ; //state.stream ) ;

	if( err != CUDA_SUCCESS ) {
		fprintf( stderr , "SWAN : FATAL : Failure executing kernel [%s] [%d] [%d,%d,%d][%d,%d,%d]\n", kernel, err, grid.x ,grid.y, grid.z, block.x, block.y, block.z );
	assert(0);
		exit(-99);
	}

//printf("Kernel completed\n" );
}